erik.andersson@arm.com | 460c689 | 2021-02-24 14:38:09 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the |
| 18 | # maximum of the 'cycles required for bandwidth' and 'cycles required for computing'. |
| 19 | # |
| 20 | # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance |
| 21 | # estimate. |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 22 | import copy |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 23 | from enum import auto |
| 24 | from enum import IntEnum |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 25 | from typing import Optional |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 26 | from typing import Set |
| 27 | from uuid import UUID |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 28 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 29 | import numpy as np |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 30 | |
| 31 | from . import numeric_util |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 32 | from .architecture_allocator import ArchitectureBlockConfig |
Diqing Zhong | 09387e2 | 2020-09-28 18:46:22 +0200 | [diff] [blame] | 33 | from .architecture_features import Accelerator |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 34 | from .architecture_features import ArchitectureFeatures |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 35 | from .architecture_features import NpuBlockType |
| 36 | from .architecture_features import SHRAMElements |
| 37 | from .architecture_features import TensorFormat |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 38 | from .debug_database import DebugDatabase |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 39 | from .nn_graph import Graph |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 40 | from .nn_graph import NetworkType |
| 41 | from .nn_graph import PassPlacement |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 42 | from .numeric_util import round_up |
Johan Alfvén | f8e353b | 2022-02-04 17:24:23 +0100 | [diff] [blame] | 43 | from .numeric_util import round_up_to_int |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 44 | from .operation import Kernel |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 45 | from .operation import Op |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 46 | from .scheduler import Schedule |
| 47 | from .scheduler import SchedulerOperation |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 48 | from .scheduler import SchedulerOpInfo |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 49 | from .shape4d import Shape4D |
Diqing Zhong | f842b69 | 2020-12-11 13:07:37 +0100 | [diff] [blame] | 50 | from .tensor import BandwidthDirection |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 51 | from .tensor import MemArea |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 52 | from .tensor import TensorPurpose |
Johan Alfvén | 0f98de6 | 2022-05-15 14:54:51 +0200 | [diff] [blame^] | 53 | from .tensor import TensorSubPurpose |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 54 | from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype |
| 55 | from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 56 | from .weight_compressor import WeightKey |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 57 | |
| 58 | |
class PassCycles(IntEnum):
    """Categories of cycle counts collected for a pass.

    The integer value of each member is used as an index into per-category
    result arrays, so the declaration order must not change.
    """

    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable category name, used in reports."""
        return {
            PassCycles.Npu: "NPU",
            PassCycles.SramAccess: "SRAM Access",
            PassCycles.DramAccess: "DRAM Access",
            PassCycles.OnChipFlashAccess: "On-chip Flash Access",
            PassCycles.OffChipFlashAccess: "Off-chip Flash Access",
            PassCycles.Total: "Total",
            PassCycles.Size: "Size",
        }[self]

    def identifier_name(self):
        """Return the machine-friendly category name (snake_case)."""
        return {
            PassCycles.Npu: "npu",
            PassCycles.SramAccess: "sram_access",
            PassCycles.DramAccess: "dram_access",
            PassCycles.OnChipFlashAccess: "on_chip_flash_access",
            PassCycles.OffChipFlashAccess: "off_chip_flash_access",
            PassCycles.Total: "total",
            PassCycles.Size: "size",
        }[self]

    @staticmethod
    def all():
        """Every countable category, in declaration order, excluding the Size marker."""
        return tuple(member for member in PassCycles if member is not PassCycles.Size)
| 100 | |
| 101 | |
class PerformanceQuery:
    """Bundle of inputs needed to estimate the performance of one NPU operation.

    Plain data holder: callers fill in the fields describing the IFM(s), OFM,
    constant tensors, kernel and chosen block configuration before passing it
    to the measure_*/_estimate_* functions in this module.
    """

    def __init__(self, npu_block_type=0):
        # NpuBlockType of the queried operation; defaults to 0 when unspecified.
        self.npu_block_type = npu_block_type
        # Primary input feature map (IFM) description.
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        # Element widths in bits (e.g. 8/16/32); 0 until filled in.
        self.ifm_bits = 0
        self.ifm2_bits = 0
        # Secondary IFM; None when the operation has no second input
        # (consumers test `ifm2_shape is not None` to distinguish the cases).
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        # Output feature map (OFM) description.
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        # Scale/bias (constant) tensor; depth > 0 signals that one is present.
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        # Convolution/pooling kernel; 1x1 by default.
        self.kernel = Kernel(1, 1)
        # Block configuration chosen by the architecture allocator/scheduler.
        self.config = ArchitectureBlockConfig()
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 121 | |
| 122 | |
class CycleCost:
    """Accumulator for the MAC and cycle counts of an operation.

    Supports scaling (``cost * n``) and in-place accumulation (``cost += other``)
    so per-block costs can be combined into per-op and per-pass totals.
    """

    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        """Return a new CycleCost with both counts scaled by ``scale``."""
        scaled = CycleCost()
        scaled.op_macs = scale * self.op_macs
        scaled.op_cycles = scale * self.op_cycles
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another CycleCost into this one."""
        self.op_macs = self.op_macs + rhs.op_macs
        self.op_cycles = self.op_cycles + rhs.op_cycles
        return self

    def __str__(self):
        return f"macs = {self.op_macs}, cycles = {self.op_cycles}"
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 141 | |
| 142 | |
class ElementAccess:
    """Raw element-access counts for one operation.

    Holds ONLY element counts; consumers must scale these by the relevant
    bitwidths to obtain memory bandwidth figures. Supports scaling
    (``access * n``) and in-place accumulation (``access += other``).
    """

    def __init__(self):
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        """Return a new ElementAccess with every count scaled by ``scale``."""
        scaled = ElementAccess()
        scaled.ifm_read = [count * scale for count in self.ifm_read]
        scaled.ofm_write = self.ofm_write * scale
        scaled.weights_refetch = self.weights_refetch * scale
        scaled.const_read = [count * scale for count in self.const_read]
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another ElementAccess into this one."""
        self.ifm_read = [mine + theirs for mine, theirs in zip(self.ifm_read, rhs.ifm_read)]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read = [mine + theirs for mine, theirs in zip(self.const_read, rhs.const_read)]
        return self

    def __str__(self):
        # weights_refetch is intentionally not part of the summary string
        return f"ifm read = {self.ifm_read}, ofm write = {self.ofm_write}, const read={self.const_read}"
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 174 | |
| 175 | |
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    """Return byte strides for ``shape`` laid out in ``format``.

    For NHWC the strides are [+N, +Y, +X, +Z]; for NHCWB16 (16-channel brick
    format) they are [+N, +Y, +C (16-chunk), +X, +Z]. Callers in this module
    only compare strides[2] (NHCWB16) and strides[3] (NHWC) against block
    depths to classify burst behaviour.

    Note: only NHWC and NHCWB16 are handled; any other format would leave
    ``strides`` unbound and raise UnboundLocalError, matching the original
    implicit contract.
    """
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        # Bug fix: the batch stride must span all height rows; previously it
        # was identical to the +Y stride (missing the shape.height factor).
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) / 8  # +N

    return strides
Diqing Zhong | 42e833d | 2020-10-02 13:18:42 +0200 | [diff] [blame] | 192 | |
| 193 | |
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    """Scale a raw transfer amount by the expected burst inefficiency.

    Estimates the burst length (in bits) that the access pattern for this
    tensor format and block size achieves, then inflates ``to_transfer`` by
    the ratio of the memory's native burst length to the achieved one.
    """
    strides = _strides_for_shape(shape4D, format, element_bits)
    burst_len = 8  # pessimistic default, in bits

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            # Writes are interleaved across cores
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                # Contiguous in depth: whole rows stream in one burst
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_bytes = min(arch.memory_burst_length[mem_area], burst_len // 8)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_bytes)
| 223 | |
| 224 | |
def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    """Return (ifm_cycles, ofm_cycles): latency plus transfer cycles for one block.

    Only the elements actually present in the block are counted (the block is
    clipped to the tensor shape before computing the element count).
    """

    def _block_cycles(is_read, mem_area, fmt, bits, block, shape, direction):
        # HW transfer for one block: fixed memory latency plus the effective
        # transfer amount divided by the memory's bandwidth per cycle.
        elements = Shape4D.min(shape, block).elements()
        latency = arch.memory_latency[mem_area][direction]
        effective = _estimate_memory_transfer_efficiency(arch, is_read, mem_area, fmt, bits, block, shape, elements)
        return latency + effective / arch.memory_bandwidths_per_cycle[mem_area]

    # Input block HW transfer (only for elements present)
    cycles_ifm_blk = _block_cycles(
        True,
        query.ifm_memory_area,
        query.ifm_format,
        query.ifm_bits,
        query.config.ifm_block,
        query.ifm_shape,
        BandwidthDirection.Read,
    )
    # Output block HW transfer (only for elements present)
    cycles_ofm_blk = _block_cycles(
        False,
        query.ofm_memory_area,
        query.ofm_format,
        query.ofm_bits,
        query.config.ofm_block,
        query.ofm_shape,
        BandwidthDirection.Write,
    )
    return cycles_ifm_blk, cycles_ofm_blk
| 259 | |
| 260 | |
def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate the output/activation pipeline cost in cycles per OFM element.

    Selects an output performance index (row into arch.output_cycles_per_elem)
    from the operator type, block type and accumulator width, and an activation
    index (into arch.activation_cycles_per_elem) from the fused activation
    function; the slower of the two dominates. For elementwise ops the result
    is additionally floored by the minimum memory cycles needed to stream one
    IFM/OFM block.
    """
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Unary op else Binary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        # Advanced Add/Sub. The "simple" Add/Sub variant (perf index 4) was
        # unreachable dead code behind an `if False:` guard and has been
        # removed. TODO: Add as perf selection as operator variant
        output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        # Elementwise ops can be memory bound: floor the per-element cost by
        # the block transfer cost spread over one OFM block's elements.
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 310 | |
| 311 | |
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate total NPU cycles for a convolution-family operation.

    Models the DPU compute cost per OFM block (per sub-kernel, including
    pipeline delay cycles that depend on the accelerator configuration and
    accumulator width), the output/activation cost per block, and a minimum
    command/memory cost; the slower of compute and output dominates across
    the blocks while the other is paid once.
    """
    # Clip blocks to the actual tensor shapes
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        # Height-1 optimisation: treat the micro-block as 1x4 instead of 2x2
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    # Number of micro-blocks per OFM block, per axis
    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    # Kernels larger than the HW sub-kernel limit are split into sub-kernels
    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    # Weight buffer write-back cost per micro-block column
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            # Depth-first traversal: one step per kernel element
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            # Part-kernel-first traversal (only valid for ConvolutionMxN)
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        # Pipeline delay between kernel steps; depends on accelerator config
        # and accumulator width, and only bites for narrow (1x1 ublock) blocks
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    # Full-depth ops repeat the block cost for every IFM depth slice
    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    # Floor both compute and output by the minimum command/memory cost
    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    # Compute and output overlap: the slower stage is paid per block, the
    # faster one only once (pipeline fill/drain)
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles
| 436 | |
| 437 | |
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    """Estimate cycles for a memory-to-memory transfer of ``to_transfer`` units.

    The transfer is limited by the slower side; only the source side pays the
    read latency.
    """
    read_cycles = arch.memory_latency[from_mem_area][BandwidthDirection.Read] + (
        to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    )
    write_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(read_cycles, write_cycles)
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 443 | |
Patrik Gustavsson | ee99bb1 | 2021-04-08 09:04:00 +0200 | [diff] [blame] | 444 | |
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Return the CycleCost (MACs and cycles) for the queried operation."""
    result = CycleCost()

    conv_family = (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    )
    if query.npu_block_type in conv_family:
        # Convolution/Vector product cycle calculation.
        # op_macs and op_cycles should both handle >32-bits, hence the int()
        # conversions before multiplying.
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            depth_factor = 1  # each output channel reads a single input channel
        else:
            depth_factor = int(query.ifm_shape.depth)
        result.op_macs = int(query.kernel.elements_wh()) * depth_factor * int(query.ofm_shape.elements())
        result.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    elif query.npu_block_type == NpuBlockType.ElementWise:
        # Elementwise cycle calculation: no MACs, per-element output cost over
        # the storage-rounded OFM shape.
        result.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        rounded_elements = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        result.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * rounded_elements
        )
    else:
        assert False

    return result
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 477 | |
| 478 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 479 | def measure_element_access(arch, query: PerformanceQuery): |
| 480 | access = ElementAccess() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 481 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 482 | ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block) |
| 483 | ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block) |
| 484 | ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format])) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 485 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 486 | # Number of ofm blocks in the overall output shape |
| 487 | ofm_blocks = query.ofm_shape.div_round_up(ofm_block) |
| 488 | ofm_block_depth = ofm_block.depth |
| 489 | if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| 490 | ofm_blocks = ofm_blocks.with_depth(1) |
| 491 | ofm_block_depth = query.ifm_shape.depth |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 492 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 493 | # Convolution & pooling |
| 494 | if query.npu_block_type in ( |
| 495 | NpuBlockType.ConvolutionMxN, |
| 496 | NpuBlockType.ConvolutionDepthWise, |
| 497 | NpuBlockType.VectorProduct, |
| 498 | NpuBlockType.Pooling, |
| 499 | NpuBlockType.ReduceSum, |
| 500 | ): |
| 501 | # Number of sub kernels |
| 502 | sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type] |
| 503 | subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0]) |
| 504 | subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1]) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 505 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 506 | ofm_block_count = ofm_blocks.elements() |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 507 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 508 | ifm_fetch = ( |
| 509 | Shape4D.round_up(ifm_block, ifm_rounding).elements_wh() |
| 510 | * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 511 | ) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 512 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 513 | if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling): |
| 514 | kernel_read = query.kernel.elements_wh() * 1 # force to no reread |
| 515 | else: |
| 516 | kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 517 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 518 | weight_fetch = kernel_read * ofm_block_depth * ofm_block_count |
| 519 | |
| 520 | access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count |
| 521 | |
| 522 | if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum): |
| 523 | access.const_read[0] = weight_fetch |
| 524 | access.const_read[1] = query.ofm_shape.depth # Scales & biases |
| 525 | access.weights_refetch = ofm_blocks.elements_wh() |
| 526 | # Elementwise |
| 527 | elif query.npu_block_type == NpuBlockType.ElementWise: |
| 528 | if query.ifm_shape.elements() == 1: |
| 529 | if query.ifm_bits > 8: |
| 530 | # ifm is a non 8-bit scalar |
| 531 | access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements() |
| 532 | if query.ifm2_shape: |
| 533 | access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 534 | else: |
| 535 | access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 536 | if query.ifm2_shape: |
| 537 | if query.ifm2_shape.elements() > 1: |
| 538 | access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements() |
| 539 | elif query.ifm2_bits > 8: |
| 540 | # ifm2 is a non 8-bit scalar |
| 541 | access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements() |
| 542 | # Unknown |
| 543 | else: |
| 544 | assert False |
| 545 | |
| 546 | ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format])) |
| 547 | access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements() |
| 548 | return access |
| 549 | |
| 550 | |
def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    """Measure element access and cycle cost for a (sub-)area of an operation.

    The query's OFM shape is clipped to the given offset/sub-shape so the
    cost of a partial stripe can be measured; pass offset=None and
    sub_shape=None to measure the full operation. The caller's query object
    is left untouched (a deep copy is modified instead).

    Returns a tuple (ElementAccess, CycleCost).
    """
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    # Work on a copy so the caller's query is not modified
    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    # Previously, fresh ElementAccess/CycleCost instances were created here and
    # immediately overwritten/added-to; measure the sub-query directly instead.
    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)

    return access, cycles
| 578 | |
| 579 | |
def make_bandwidth_array():
    """Create a zero-initialised bandwidth accumulator indexed by [memory area][tensor purpose][direction]."""
    shape = (MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)
    return np.zeros(shape)
| 582 | |
| 583 | |
def make_cycles_array():
    """Create a zero-initialised cycle accumulator with one slot per PassCycles entry."""
    return np.zeros(int(PassCycles.Size))
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 586 | |
| 587 | |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 588 | def update_summary_cycles(arch, bws, cycles): |
| 589 | cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 590 | cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram] |
| 591 | cycles[PassCycles.OnChipFlashAccess] = ( |
| 592 | np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash] |
| 593 | ) |
| 594 | cycles[PassCycles.OffChipFlashAccess] = ( |
| 595 | np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash] |
| 596 | ) |
| 597 | |
| 598 | cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total]) |
| 599 | return cycles |
| 600 | |
| 601 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 602 | def estimate_full_op_performance( |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 603 | arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 604 | ): |
| 605 | cycles_a = make_cycles_array() |
| 606 | bws = make_bandwidth_array() |
| 607 | scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency |
| 608 | macs = 0 |
| 609 | |
| 610 | query = PerformanceQuery(op.op_type.npu_block_type) |
| 611 | query.ifm_shape = op.ifm.shape |
| 612 | query.ifm_format = op.ifm.format |
| 613 | query.ifm_memory_area = op.ifm.mem_area |
| 614 | query.ifm_bits = op.ifm.dtype.size_in_bits() |
| 615 | query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| 616 | query.ifm2_format = op.ifm2 and op.ifm2.format |
| 617 | query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| 618 | query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| 619 | query.ofm_shape = op.ofm.shape |
| 620 | query.ofm_memory_area = op.ofm.mem_area |
| 621 | query.ofm_bits = op.ofm.dtype.size_in_bits() |
| 622 | query.ofm_format = op.ofm.format |
| 623 | query.kernel = op.kernel |
| 624 | query.config = block_config |
| 625 | |
| 626 | cost = schedule.cost_map[op] |
| 627 | prev_cost = schedule.cost_map[prev_op] if prev_op else None |
| 628 | if op.parent_op.bias: |
| 629 | query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 630 | if cost.buffered_weight_tensors: |
| 631 | query.const_memory_area = cost.buffered_weight_tensors[0].mem_area |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 632 | else: |
| 633 | query.const_memory_area = cost.npu_weights_tensor.mem_area |
| 634 | |
| 635 | cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query) |
| 636 | cycles_a[PassCycles.Npu] = cycles.op_cycles |
| 637 | macs = cycles.op_macs |
| 638 | |
| 639 | access = measure_element_access(arch, query) |
| 640 | |
| 641 | # How many NPU cycles are available under the previously executing |
| 642 | # operator for performing buffered DMA transfers |
| 643 | slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0 |
| 644 | |
| 645 | # LUT Transfer |
| 646 | parent_op = op.parent_op |
| 647 | lut_transfer_cycles = 0 |
| 648 | if parent_op.activation_lut: |
| 649 | lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] |
| 650 | src_tensor = lut_tensor.src_tensor |
| 651 | if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: |
| 652 | bw = src_tensor.storage_size() |
| 653 | lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) |
| 654 | |
| 655 | bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
| 656 | # LUT read from SHRAM TODO remove? |
Ayaan Masood | d5cbef3 | 2022-02-22 15:56:35 +0000 | [diff] [blame] | 657 | scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 658 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 659 | if cost.npu_weights_tensor and cost.buffered_weight_tensors: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 660 | # DMA Weight Transfer |
| 661 | sz = 0 |
| 662 | # Get the size of the first DMA |
| 663 | for core in range(0, arch.ncores): |
| 664 | key = WeightKey(core, 0) |
| 665 | if key in cost.npu_weights_tensor.encoded_ranges: |
| 666 | weight_range = cost.npu_weights_tensor.encoded_ranges[key] |
| 667 | sz += round_up(weight_range.total_bytes, 16) |
| 668 | |
| 669 | total_sz = len(cost.npu_weights_tensor.buffer) |
| 670 | bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 671 | bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 672 | |
| 673 | ws_first_transfer_cycles = measure_mem2mem_cycles( |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 674 | arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 675 | ) |
| 676 | |
| 677 | # Add cycles for Weight + Scale Transfer |
Johan Alfvén | 0f98de6 | 2022-05-15 14:54:51 +0200 | [diff] [blame^] | 678 | if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer: |
| 679 | # Double buffer - weights can be fetched in parallel |
| 680 | cycles_a[PassCycles.Npu] = max( |
| 681 | cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, |
| 682 | cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), |
| 683 | ) |
| 684 | else: |
| 685 | # Standard buffer - weights can not be fetched in parallel so weight transfer |
| 686 | # must be included in the result |
| 687 | cycles_a[PassCycles.Npu] = ( |
| 688 | cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles) |
| 689 | ) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 690 | |
| 691 | # Add cycles for LUT Transfer |
| 692 | cycles_a[PassCycles.Npu] += lut_transfer_cycles |
| 693 | else: |
| 694 | # Add cycles for LUT Transfer |
| 695 | cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0) |
| 696 | |
| 697 | # OFM write |
| 698 | ofm = op.parent_op.ofm |
| 699 | bw = access.ofm_write * ofm.element_size() |
| 700 | bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw |
| 701 | scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( |
| 702 | arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw |
| 703 | ) |
| 704 | |
| 705 | # IFM read |
| 706 | ifm = op.parent_op.ifm |
| 707 | bw = access.ifm_read[0] * ifm.element_size() |
| 708 | bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw |
| 709 | scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 710 | arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw |
| 711 | ) |
| 712 | if query.ifm2_shape: |
| 713 | ifm2 = op.parent_op.ifm2 |
| 714 | bw = access.ifm_read[1] * ifm2.element_size() |
| 715 | bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw |
| 716 | scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 717 | arch, |
| 718 | True, |
| 719 | query.ifm2_memory_area, |
| 720 | ifm2.format, |
| 721 | op.ifm2.dtype.size_in_bits(), |
| 722 | query.config.ifm_block, |
| 723 | query.ifm2_shape, |
| 724 | bw, |
| 725 | ) |
| 726 | |
| 727 | # Weight read |
| 728 | if access.const_read[0] > 0: |
| 729 | # alignment not accounted for in bandwidth_compression_scale_approx |
| 730 | encoded_size_approx = ( |
| 731 | cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size() |
| 732 | ) |
| 733 | orig_weight_size = parent_op.weights.elements() |
| 734 | bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size |
| 735 | bw = access.const_read[0] * bandwidth_compression_scale_approx |
| 736 | bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| 737 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 738 | if not cost.buffered_weight_tensors: |
Patrik Gustavsson | 225e19d | 2021-06-01 12:43:43 +0200 | [diff] [blame] | 739 | scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| 740 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 741 | if access.const_read[1] > 0: |
| 742 | # Scales & biases |
| 743 | bw = access.const_read[1] * op.parent_op.bias.element_size() |
| 744 | bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| 745 | |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 746 | if not cost.buffered_weight_tensors: |
Patrik Gustavsson | 225e19d | 2021-06-01 12:43:43 +0200 | [diff] [blame] | 747 | scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| 748 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 749 | update_summary_cycles(arch, scaled_bws, cycles_a) |
| 750 | |
| 751 | return bws, macs, cycles_a |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 752 | |
| 753 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 754 | def print_performance( |
| 755 | nng: Graph, |
| 756 | arch: ArchitectureFeatures, |
| 757 | network_type: NetworkType, |
| 758 | bws: dict, |
| 759 | macs: dict, |
| 760 | cycles: dict, |
| 761 | mem_usage: dict, |
| 762 | ): |
| 763 | if network_type == NetworkType.TFLite: |
| 764 | nng_optype_to_input_op_type = tflite_optype_to_builtintype |
| 765 | else: |
| 766 | nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type |
| 767 | |
| 768 | suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()} |
| 769 | |
| 770 | for sg in nng.subgraphs: |
| 771 | |
| 772 | if sg.placement != PassPlacement.Npu: |
| 773 | continue |
| 774 | |
| 775 | print(f"\n{str('#') * 80}") |
| 776 | print(f"Performance for NPU Subgraph {sg.name}") |
| 777 | print( |
| 778 | f" {network_type.name + str(' Operator:'):20s}" |
| 779 | f" {str('NNG Operator:'):20s}" |
| 780 | f" {str('SRAM Usage'):>10s}" |
| 781 | f" ({str('Peak'):>6s}%):" |
| 782 | f"{str('Op Cycles'):>10s}" |
| 783 | f" ({str('Netwrk'):>6s}%)" |
| 784 | f" [" |
| 785 | f" {str('NPU'):>10s}" |
| 786 | f" {str('SRAM AC'):>10s}" |
| 787 | f" {str('DRAM AC'):>10s}" |
| 788 | f" {str('OnFlash AC'):>10s}" |
| 789 | f" {str('OffFlashAC'):>10s}" |
| 790 | f" ]:" |
| 791 | f"{str('MAC Count'):>10s}" |
| 792 | f" ({str('Netwrk'):>6s}% / {str('Util'):>6s}%):" |
| 793 | f"Name:" |
| 794 | ) |
| 795 | |
| 796 | for sched_op in sg.sched_ops: |
| 797 | # get source op name |
| 798 | sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1] |
| 799 | if sched_op_src_uid == DebugDatabase.NULLREF: |
| 800 | src_op_type = None |
| 801 | else: |
| 802 | src_op_type = suid_inv_map[sched_op_src_uid].type |
| 803 | |
| 804 | src_op_name = nng_optype_to_input_op_type(src_op_type) |
| 805 | |
| 806 | max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores |
| 807 | |
| 808 | print( |
| 809 | f" {src_op_name:20s}" |
| 810 | f" {sched_op.op_type:20s}" |
| 811 | f" {mem_usage[sched_op]:10.0f}" |
| 812 | f" ({mem_usage[sched_op] / nng.memory_used[MemArea.Sram] * 100:6.2f}%)" |
| 813 | f" {cycles[sched_op][PassCycles.Total]:10.0f}" |
| 814 | f" ({cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total] * 100:6.2f}%)" |
| 815 | f" [" |
| 816 | f" {cycles[sched_op][PassCycles.Npu]:10.0f}" |
| 817 | f" {cycles[sched_op][PassCycles.SramAccess]:10.0f}" |
| 818 | f" {cycles[sched_op][PassCycles.DramAccess]:10.0f}" |
| 819 | f" {cycles[sched_op][PassCycles.OnChipFlashAccess]:10.0f}" |
| 820 | f" {cycles[sched_op][PassCycles.OffChipFlashAccess]:10.0f}" |
| 821 | f" ]" |
| 822 | f" {macs[sched_op]:10d}" |
| 823 | f" ({macs[sched_op] / nng.macs * 100:6.2f}% / {macs[sched_op] / max_macs * 100:6.2f}%)" |
| 824 | f" {sched_op.name:s}" |
| 825 | ) |
| 826 | |
| 827 | |
def calc_new_performance_for_network(nng: Graph, arch, network_type: NetworkType, verbose_performance: bool):
    """Calculate and attach performance totals for the whole network.

    Walks every subgraph's scheduled operations, accumulating the bandwidth,
    MAC and cycle estimates from estimate_full_op_performance, plus total
    original/encoded weight sizes. Results are stored on nng (bandwidths,
    macs, cycles, total_original_weights, total_npu_encoded_weights).
    If verbose_performance is set, a per-op breakdown is printed afterwards.
    """
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    # Per-op results, kept for the optional verbose printout
    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):

                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):

                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage)