erik.andersson@arm.com | 460c689 | 2021-02-24 14:38:09 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the |
| 18 | # maximum of the 'cycles required for bandwidth' and 'cycles required for computing'. |
| 19 | # |
| 20 | # Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance |
| 21 | # estimate. |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 22 | import copy |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 23 | from enum import auto |
| 24 | from enum import IntEnum |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 25 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 26 | import numpy as np |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 27 | |
| 28 | from . import numeric_util |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 29 | from .architecture_allocator import ArchitectureBlockConfig |
Diqing Zhong | 09387e2 | 2020-09-28 18:46:22 +0200 | [diff] [blame] | 30 | from .architecture_features import Accelerator |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 31 | from .architecture_features import NpuBlockType |
| 32 | from .architecture_features import SHRAMElements |
| 33 | from .architecture_features import TensorFormat |
| 34 | from .numeric_util import round_up |
| 35 | from .operation import Kernel |
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 36 | from .operation import Op |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 37 | from .scheduler import Schedule |
| 38 | from .scheduler import SchedulerOperation |
| 39 | from .shape4d import Shape4D |
Diqing Zhong | f842b69 | 2020-12-11 13:07:37 +0100 | [diff] [blame] | 40 | from .tensor import BandwidthDirection |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 41 | from .tensor import MemArea |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 42 | from .tensor import TensorPurpose |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 43 | from .weight_compressor import WeightKey |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 44 | |
| 45 | |
class PassCycles(IntEnum):
    """Categories of cycle counts recorded for a pass.

    `Total` is the combined estimate and `Size` is the number of categories;
    `all()` yields every real category (i.e. everything except `Size`).
    """

    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        """Return the human-readable name for this category."""
        names = (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )
        return names[self.value]

    def identifier_name(self):
        """Return the snake_case identifier for this category."""
        identifiers = (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )
        return identifiers[self.value]

    @staticmethod
    def all():
        # Every category except the Size sentinel, in declaration order
        return tuple(member for member in PassCycles if member is not PassCycles.Size)
| 75 | |
| 76 | |
class PerformanceQuery:
    """Bundle of operator parameters consumed by the performance estimators.

    Fields are plain attributes filled in by the caller before invoking
    measure_cycle_cost / measure_element_access.
    """

    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        # Primary input feature map
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        # Secondary input feature map (binary elementwise only); None when unused
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm2_bits = 0
        # Output feature map
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        # Constant inputs (weights / scales & biases)
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        # Kernel and chosen HW block configuration
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 96 | |
| 97 | |
class CycleCost:
    """Accumulator for the MAC and cycle counts of an operation."""

    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        """Return a new CycleCost with both counts multiplied by scale."""
        scaled = CycleCost()
        scaled.op_macs = scale * self.op_macs
        scaled.op_cycles = scale * self.op_cycles
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another CycleCost into this one in place."""
        self.op_macs = self.op_macs + rhs.op_macs
        self.op_cycles = self.op_cycles + rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 116 | |
| 117 | |
class ElementAccess:
    """Raw element-access counts for one operation.

    Holds ONLY element counts; consumers must scale these by the correct bit
    widths to compute actual memory bandwidth.
    """

    def __init__(self):
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        """Return a new ElementAccess with every count multiplied by scale."""
        scaled = ElementAccess()
        scaled.ifm_read = [count * scale for count in self.ifm_read]
        scaled.ofm_write = self.ofm_write * scale
        scaled.weights_refetch = self.weights_refetch * scale
        scaled.const_read = [count * scale for count in self.const_read]
        return scaled

    def __iadd__(self, rhs):
        """Accumulate another ElementAccess into this one in place."""
        for idx in (0, 1):
            self.ifm_read[idx] += rhs.ifm_read[idx]
            self.const_read[idx] += rhs.const_read[idx]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 149 | |
| 150 | |
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    """Return per-axis strides in bytes for a tensor of `shape` in `format`.

    NHWC yields [+N, +Y, +X, +Z]; NHCWB16 (16-channel brick format) yields
    [+N, +Y, +C, +X, +Z]. `element_bits` is the storage size of one element.
    Only NHWC and NHCWB16 are handled; any other format would raise NameError
    on the return (strides never assigned).
    """
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z  NOTE(review): true division (float) here but floor division below — confirm intended
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth) / 8  # +N  NOTE(review): equals the +Y stride (height factor missing) — harmless for batch 1, verify for N > 1

    return strides
Diqing Zhong | 42e833d | 2020-10-02 13:18:42 +0200 | [diff] [blame] | 167 | |
| 168 | |
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    """Inflate a transfer size to account for burst (in)efficiency.

    Estimates the achievable burst length (in bits, converted to bytes below)
    for accessing a tensor of the given format with the given HW block size,
    then scales `to_transfer` by the ratio of the memory's native burst length
    to the achievable burst length. Returns the effective transfer amount.
    """
    # Conservative default burst length (bits) if no case below matches
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        # NOTE(review): comparing a byte stride against a depth element count
        # looks valid only for 8-bit elements — see TODO below
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            # Writes are striped across cores
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    # Achievable burst cannot exceed the memory's native burst length
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)
| 198 | |
| 199 | |
def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    """Return (ifm_cycles, ofm_cycles) for transferring one HW block.

    Each value is the memory latency plus the burst-adjusted transfer time for
    the elements actually present in one IFM block read / one OFM block write.
    """

    def block_cycles(is_read, mem_area, tensor_format, bits, block, shape, direction):
        # Only count elements actually present in the (possibly clipped) block
        elements = Shape4D.min(shape, block).elements()
        transfer = _estimate_memory_transfer_efficiency(
            arch, is_read, mem_area, tensor_format, bits, block, shape, elements
        )
        latency = arch.memory_latency[mem_area][direction]
        return latency + transfer / arch.memory_bandwidths_per_cycle[mem_area]

    # Input block HW transfer (only for elements present)
    cycles_ifm_blk = block_cycles(
        True,
        query.ifm_memory_area,
        query.ifm_format,
        query.ifm_bits,
        query.config.ifm_block,
        query.ifm_shape,
        BandwidthDirection.Read,
    )
    # Output block HW transfer (only for elements present)
    cycles_ofm_blk = block_cycles(
        False,
        query.ofm_memory_area,
        query.ofm_format,
        query.ofm_bits,
        query.config.ofm_block,
        query.ofm_shape,
        BandwidthDirection.Write,
    )
    return cycles_ifm_blk, cycles_ofm_blk
| 234 | |
| 235 | |
def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate cycles per output element for the output/activation stage.

    Selects a per-element cycle figure from the architecture's performance
    tables based on the operation variant and the fused activation function,
    taking the slower of the two. For elementwise ops the per-DPU command
    overhead of moving one block through memory is also folded in.
    """
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Unary op else Binary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        # Always modelled as the advanced Add/Sub variant; the simple variant
        # (table index 4) was behind an unreachable `if False:` branch.
        # TODO: Add as perf selection as operator variant
        output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    # The slower of the output stage and the activation stage dominates
    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        # Account for per-block command overhead (memory transfer of one block)
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem
Diqing Zhong | e8887a3 | 2020-09-24 09:53:48 +0200 | [diff] [blame] | 285 | |
| 286 | |
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Estimate total NPU cycles for a convolution-family operation.

    Models the DPU (MAC engine) cycles and the output/activation stage cycles
    for one OFM block, then serialises the slower stage over all OFM blocks
    while overlapping a single pass of the faster one.
    """
    # Clip HW block sizes to the actual tensor extents
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    # Height-1 outputs can use a 1x4 micro-block instead of the 2-high one
    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    # Micro-blocks per OFM block in each dimension
    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    # Kernels beyond the HW sub-kernel limits (height, width) are split up
    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    # Element count of each sub kernel
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    # presumably weight-buffer load cycles per micro-block column — TODO confirm
    cycles_wb = 32 * ofm_ublock.depth // 8

    # Accumulate DPU cycles for one OFM block over all sub kernels
    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            # 16-bit pooling is half rate except on the 32-MAC configuration
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            # Depth-first traversal: one step per kernel element
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            # Part-kernel-first traversal (ConvolutionMxN only)
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        # Pipeline delay between kernel steps when too few micro-blocks are in
        # flight to hide it; delay magnitude depends on config and accumulator
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    # These block types iterate over the full IFM depth per OFM block
    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    # Command/DMA overhead per block; shared over the 4 DPU quadrants
    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    # Slower stage is serialised over all blocks; faster stage overlaps except
    # for one trailing pass
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles
| 409 | |
| 410 | |
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    """Return cycles to move `to_transfer` units between two memory areas.

    The slower of the source read and destination write dominates.
    """
    read_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    write_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(read_cycles, write_cycles)
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 415 | |
Patrik Gustavsson | ee99bb1 | 2021-04-08 09:04:00 +0200 | [diff] [blame] | 416 | |
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    """Measure MAC count and estimated NPU cycles for one operation.

    Args:
        arch: Architecture features for the target accelerator.
        op_type: Primary operation type.
        faf_type: Fused activation function type.
        query: Shapes, formats and block configuration of the operation.

    Returns:
        CycleCost with op_macs and op_cycles filled in.

    Raises:
        NotImplementedError: if query.npu_block_type is not supported.
    """
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            # Depthwise/pooling: each kernel element touches a single channel
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        cycles.op_cycles = int(_estimate_output_cycles_per_element(arch, op_type, faf_type, query)) * int(
            query.ofm_shape.elements()
        )
    else:
        # Was `assert False`, which is stripped under -O and would silently
        # return a zero-cost CycleCost; fail loudly instead
        raise NotImplementedError(f"Unsupported npu_block_type: {query.npu_block_type}")

    return cycles
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 447 | |
| 448 | |
def measure_element_access(arch, query: PerformanceQuery):
    """Count raw element accesses (reads/writes) performed by one operation.

    Returns an ElementAccess holding ONLY element counts; consumers must scale
    these by the correct bit widths to calculate memory bandwidth.

    Raises:
        NotImplementedError: if query.npu_block_type is not supported.
    """
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        # Depthwise/pooling traverse the full IFM depth in a single block pass
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels. sub_kernel_limits is (height, width): pair
        # height with index 0 and width with index 1, consistent with
        # _estimate_conv_cycles (the indices were previously swapped, which
        # mis-counted sub kernels for non-square kernel limits)
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        # IFM elements fetched per OFM block: one rounded block footprint over
        # the full (rounded) IFM depth
        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        # Pooling/ReduceSum have no weight tensor
        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        # Was `assert False`, which is stripped under -O and would silently
        # report zero accesses; fail loudly instead
        raise NotImplementedError(f"Unsupported npu_block_type: {query.npu_block_type}")

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access
| 519 | |
| 520 | |
def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    """Measure element access and cycle cost for a (sub-)area of an operation.

    Clips the query's OFM shape to the given offset/sub-shape (defaulting to
    the full OFM when either is None) and evaluates the cycle and element
    access models on the clipped copy, leaving the caller's query untouched.

    Returns a tuple (access, cycles) of ElementAccess and CycleCost results.
    """
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    # Deep-copy so clipping the OFM shape does not mutate the caller's query
    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    # Bind measurement results directly; no need for zero-initialised
    # ElementAccess/CycleCost placeholders that are immediately discarded
    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)

    return access, cycles
| 548 | |
| 549 | |
def make_bandwidth_array():
    """Return a zeroed bandwidth accumulator indexed by [mem area][purpose][direction]."""
    shape = (MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size)
    return np.zeros(shape)
| 552 | |
| 553 | |
def make_cycles_array():
    """Return a zeroed cycle accumulator with one slot per PassCycles category."""
    cycles = np.zeros(PassCycles.Size)
    return cycles
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 556 | |
| 557 | |
Diqing Zhong | e168b96 | 2020-11-05 17:18:47 +0100 | [diff] [blame] | 558 | def update_summary_cycles(arch, bws, cycles): |
| 559 | cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 560 | cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram] |
| 561 | cycles[PassCycles.OnChipFlashAccess] = ( |
| 562 | np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash] |
| 563 | ) |
| 564 | cycles[PassCycles.OffChipFlashAccess] = ( |
| 565 | np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash] |
| 566 | ) |
| 567 | |
| 568 | cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total]) |
| 569 | return cycles |
| 570 | |
| 571 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 572 | def estimate_full_op_performance( |
| 573 | arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config |
| 574 | ): |
| 575 | cycles_a = make_cycles_array() |
| 576 | bws = make_bandwidth_array() |
| 577 | scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency |
| 578 | macs = 0 |
| 579 | |
| 580 | query = PerformanceQuery(op.op_type.npu_block_type) |
| 581 | query.ifm_shape = op.ifm.shape |
| 582 | query.ifm_format = op.ifm.format |
| 583 | query.ifm_memory_area = op.ifm.mem_area |
| 584 | query.ifm_bits = op.ifm.dtype.size_in_bits() |
| 585 | query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| 586 | query.ifm2_format = op.ifm2 and op.ifm2.format |
| 587 | query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| 588 | query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| 589 | query.ofm_shape = op.ofm.shape |
| 590 | query.ofm_memory_area = op.ofm.mem_area |
| 591 | query.ofm_bits = op.ofm.dtype.size_in_bits() |
| 592 | query.ofm_format = op.ofm.format |
| 593 | query.kernel = op.kernel |
| 594 | query.config = block_config |
| 595 | |
| 596 | cost = schedule.cost_map[op] |
| 597 | prev_cost = schedule.cost_map[prev_op] if prev_op else None |
| 598 | if op.parent_op.bias: |
| 599 | query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
| 600 | if cost.buffered_weight_tensor: |
| 601 | query.const_memory_area = cost.buffered_weight_tensor.mem_area |
| 602 | else: |
| 603 | query.const_memory_area = cost.npu_weights_tensor.mem_area |
| 604 | |
| 605 | cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query) |
| 606 | cycles_a[PassCycles.Npu] = cycles.op_cycles |
| 607 | macs = cycles.op_macs |
| 608 | |
| 609 | access = measure_element_access(arch, query) |
| 610 | |
| 611 | # How many NPU cycles are available under the previously executing |
| 612 | # operator for performing buffered DMA transfers |
| 613 | slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0 |
| 614 | |
| 615 | # LUT Transfer |
| 616 | parent_op = op.parent_op |
| 617 | lut_transfer_cycles = 0 |
| 618 | if parent_op.activation_lut: |
| 619 | lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0] |
| 620 | src_tensor = lut_tensor.src_tensor |
| 621 | if src_tensor and lut_tensor.mem_area != src_tensor.mem_area: |
| 622 | bw = src_tensor.storage_size() |
| 623 | lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw) |
| 624 | |
| 625 | bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw |
| 626 | # LUT read from SHRAM TODO remove? |
| 627 | scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][ |
| 628 | BandwidthDirection.Read |
| 629 | ] += _estimate_memory_transfer_efficiency( |
| 630 | arch, |
| 631 | True, |
| 632 | lut_tensor.mem_area, |
| 633 | lut_tensor.format, |
| 634 | lut_tensor.element_size(), |
| 635 | query.config.ifm_block, |
| 636 | Shape4D(lut_tensor.shape), |
| 637 | bw, |
| 638 | ) |
| 639 | |
| 640 | if cost.npu_weights_tensor and cost.buffered_weight_tensor: |
| 641 | # DMA Weight Transfer |
| 642 | sz = 0 |
| 643 | # Get the size of the first DMA |
| 644 | for core in range(0, arch.ncores): |
| 645 | key = WeightKey(core, 0) |
| 646 | if key in cost.npu_weights_tensor.encoded_ranges: |
| 647 | weight_range = cost.npu_weights_tensor.encoded_ranges[key] |
| 648 | sz += round_up(weight_range.total_bytes, 16) |
| 649 | |
| 650 | total_sz = len(cost.npu_weights_tensor.buffer) |
| 651 | bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz |
| 652 | bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz |
| 653 | |
| 654 | ws_first_transfer_cycles = measure_mem2mem_cycles( |
| 655 | arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz |
| 656 | ) |
| 657 | |
| 658 | # Add cycles for Weight + Scale Transfer |
| 659 | cycles_a[PassCycles.Npu] = max( |
| 660 | cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles, |
| 661 | cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0), |
| 662 | ) |
| 663 | |
| 664 | # Add cycles for LUT Transfer |
| 665 | cycles_a[PassCycles.Npu] += lut_transfer_cycles |
| 666 | else: |
| 667 | # Add cycles for LUT Transfer |
| 668 | cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0) |
| 669 | |
| 670 | # OFM write |
| 671 | ofm = op.parent_op.ofm |
| 672 | bw = access.ofm_write * ofm.element_size() |
| 673 | bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw |
| 674 | scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency( |
| 675 | arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw |
| 676 | ) |
| 677 | |
| 678 | # IFM read |
| 679 | ifm = op.parent_op.ifm |
| 680 | bw = access.ifm_read[0] * ifm.element_size() |
| 681 | bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw |
| 682 | scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 683 | arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw |
| 684 | ) |
| 685 | if query.ifm2_shape: |
| 686 | ifm2 = op.parent_op.ifm2 |
| 687 | bw = access.ifm_read[1] * ifm2.element_size() |
| 688 | bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw |
| 689 | scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency( |
| 690 | arch, |
| 691 | True, |
| 692 | query.ifm2_memory_area, |
| 693 | ifm2.format, |
| 694 | op.ifm2.dtype.size_in_bits(), |
| 695 | query.config.ifm_block, |
| 696 | query.ifm2_shape, |
| 697 | bw, |
| 698 | ) |
| 699 | |
| 700 | # Weight read |
| 701 | if access.const_read[0] > 0: |
| 702 | # alignment not accounted for in bandwidth_compression_scale_approx |
| 703 | encoded_size_approx = ( |
| 704 | cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size() |
| 705 | ) |
| 706 | orig_weight_size = parent_op.weights.elements() |
| 707 | bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size |
| 708 | bw = access.const_read[0] * bandwidth_compression_scale_approx |
| 709 | bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw |
| 710 | |
| 711 | if access.const_read[1] > 0: |
| 712 | # Scales & biases |
| 713 | bw = access.const_read[1] * op.parent_op.bias.element_size() |
| 714 | bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw |
| 715 | |
| 716 | update_summary_cycles(arch, scaled_bws, cycles_a) |
| 717 | |
| 718 | return bws, macs, cycles_a |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 719 | |
| 720 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 721 | def calc_new_performance_for_network(nng, arch): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 722 | total_bws = make_bandwidth_array() |
Diqing Zhong | 69aadd0 | 2020-12-08 13:08:48 +0100 | [diff] [blame] | 723 | total_macs = 0 |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 724 | total_cycles = np.zeros(PassCycles.Size) |
| 725 | |
| 726 | for sg in nng.subgraphs: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 727 | prev_op = None |
| 728 | for sched_op in sg.sched_ops: |
| 729 | op_info = sg.schedule.cost_map[sched_op] |
| 730 | bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 731 | total_bws += bws |
| 732 | total_macs += macs |
| 733 | total_cycles += cycles |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 734 | prev_op = sched_op |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 735 | |
| 736 | nng.bandwidths = total_bws |
| 737 | nng.macs = total_macs |
| 738 | nng.cycles = total_cycles |