# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
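#
# A rough sketch of how the pieces below fit together: for each scheduled operation,
# estimate_full_op_performance() combines compute cycles from measure_cycle_cost() with
# element accesses from measure_element_access(), scaled by the estimated memory transfer
# efficiency; update_summary_cycles() then takes the maximum over the cycle sources.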
import copy
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .debug_database import DebugDatabase
from .nn_graph import Graph
from .nn_graph import NetworkType
from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
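    # Self-contained description of one NPU job: tensor shapes, formats, memory
    # placement, bit widths, kernel and block config. The measure_*/_estimate_*
    # functions below operate on this query rather than on graph objects.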
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
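    # MAC and cycle counts for an operation; supports scaling (*) and
    # accumulation (+=) so costs measured per sub-shape can be combined.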
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


class ElementAccess:
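    # Element access counts for one operation (elements, not bytes); scaled
    # and accumulated like CycleCost.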
    def __init__(self):
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read = {}".format(self.ifm_read, self.ofm_write, self.const_read)


def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
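    # Byte strides per axis for the two storage layouts: plain NHWC, and the
    # NPU's 16-channel bricked NHCWB16 format (axes N, H, C/16, W, B16).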
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) / 8  # +N

    return strides


def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
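    # Scale a raw transfer size by how well accesses map onto memory bursts:
    # the achievable burst length is derived from the tensor layout and block
    # shape, then the transfer is inflated by (ideal burst / achievable burst).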
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
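    # Lower bound on cycles to move one IFM block in and one OFM block out:
    # memory latency plus transfer time at the memory's bandwidth per cycle.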
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
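    # The perf indices below select a calibrated cycles-per-element figure from
    # the architecture tables; the slower of output path and activation path wins.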
262 if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
263 # Unary op else Binary op
264 output_perf_index = 0 if query.ifm2_shape is not None else 1
265 elif op_type == Op.Mul and query.ofm_bits == 32:
Diqing Zhonge8887a32020-09-24 09:53:48 +0200266 output_perf_index = 2
Tim Halld8339a72021-05-27 18:49:40 +0100267 elif op_type == Op.Mul or (
268 query.npu_block_type
Diqing Zhonge8887a32020-09-24 09:53:48 +0200269 in (
270 NpuBlockType.ConvolutionMxN,
271 NpuBlockType.ConvolutionDepthWise,
272 NpuBlockType.Pooling,
273 NpuBlockType.ReduceSum,
274 NpuBlockType.VectorProduct,
275 )
Tim Halld8339a72021-05-27 18:49:40 +0100276 and query.config.acc_type == SHRAMElements.Acc40
Diqing Zhonge8887a32020-09-24 09:53:48 +0200277 ):
278 output_perf_index = 3
Tim Halld8339a72021-05-27 18:49:40 +0100279 elif op_type in (Op.Add, Op.Sub):
280 if False:
Diqing Zhonge8887a32020-09-24 09:53:48 +0200281 # Simple Add/Sub
282 output_perf_index = 4
283 else:
Tim Halld8339a72021-05-27 18:49:40 +0100284 # Advanced Add/Sub TODO: Add as perf selection as operator variant
Diqing Zhonge8887a32020-09-24 09:53:48 +0200285 output_perf_index = 5
Tim Halld8339a72021-05-27 18:49:40 +0100286 elif op_type.is_maxpool_op():
Diqing Zhonge8887a32020-09-24 09:53:48 +0200287 output_perf_index = 6
288 else:
289 output_perf_index = 7
290
Tim Halld8339a72021-05-27 18:49:40 +0100291 if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
Diqing Zhonge8887a32020-09-24 09:53:48 +0200292 activation_perf_index = 0
Tim Halld8339a72021-05-27 18:49:40 +0100293 elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
Diqing Zhonge8887a32020-09-24 09:53:48 +0200294 activation_perf_index = 1
295 else:
296 activation_perf_index = 2
297
Diqing Zhonge8887a32020-09-24 09:53:48 +0200298 cycle_per_elem = max(
299 arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
300 )
Diqing Zhong986e3192020-11-16 16:15:56 +0100301
Tim Halld8339a72021-05-27 18:49:40 +0100302 if op_type.is_elementwise_op():
303 num_elems_blk = query.config.ofm_block.elements()
304 ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
305 cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
306 cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4 # per DPU
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100307 cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)
308
Tim Halld8339a72021-05-27 18:49:40 +0100309 return cycle_per_elem
Diqing Zhonge8887a32020-09-24 09:53:48 +0200310
311
Tim Halld8339a72021-05-27 18:49:40 +0100312def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
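    # Estimate total cycles for one convolution-like operation: DPU cycles are
    # accumulated per sub-kernel over the OFM micro-blocks, then overlapped with
    # the output/activation path, so the slower of the two dominates per block.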
313 ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
314 ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
Diqing Zhonge5204a62020-10-13 11:42:37 +0200315
316 if (
317 arch.config.ofm_ublock.height == 2
Tim Halld8339a72021-05-27 18:49:40 +0100318 and query.npu_block_type
Diqing Zhonge5204a62020-10-13 11:42:37 +0200319 in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
Tim Halld8339a72021-05-27 18:49:40 +0100320 and query.ofm_shape.height == 1
Diqing Zhonge5204a62020-10-13 11:42:37 +0200321 # Optimisation only applies for even width tensors
Tim Halld8339a72021-05-27 18:49:40 +0100322 and query.ofm_shape.width % 2 == 0
323 and query.kernel.height == 1
Diqing Zhonge5204a62020-10-13 11:42:37 +0200324 ):
Tim Halld8339a72021-05-27 18:49:40 +0100325 ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
326 ofm_block = ofm_block.with_height(1)
327 else:
328 ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())
Diqing Zhonge5204a62020-10-13 11:42:37 +0200329
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100330 num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
Tim Halld8339a72021-05-27 18:49:40 +0100331 num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100332 num_ublk_xy = num_ublk_x * num_ublk_y
Tim Halld8339a72021-05-27 18:49:40 +0100333 num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
334 use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40
Diqing Zhong09387e22020-09-28 18:46:22 +0200335
Tim Halld8339a72021-05-27 18:49:40 +0100336 sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
337 n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
338 n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
Diqing Zhong09387e22020-09-28 18:46:22 +0200339 sub_kernel_x = [
Tim Halld8339a72021-05-27 18:49:40 +0100340 min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
Diqing Zhong09387e22020-09-28 18:46:22 +0200341 ]
342 sub_kernel_y = [
Tim Halld8339a72021-05-27 18:49:40 +0100343 min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
Diqing Zhong09387e22020-09-28 18:46:22 +0200344 ]
345 sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
346
Diqing Zhong09387e22020-09-28 18:46:22 +0200347 cycles_dpu_blk = 0
Diqing Zhong986e3192020-11-16 16:15:56 +0100348 cycles_wb = 32 * ofm_ublock.depth // 8
Diqing Zhong09387e22020-09-28 18:46:22 +0200349
350 for num_kernel_elems in sub_kernel_size:
Tim Halld8339a72021-05-27 18:49:40 +0100351 if query.npu_block_type == NpuBlockType.Pooling:
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100352 num_kernel_steps = 1
Diqing Zhong986e3192020-11-16 16:15:56 +0100353 cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
Tim Halld8339a72021-05-27 18:49:40 +0100354 if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
Diqing Zhong09387e22020-09-28 18:46:22 +0200355 cycles *= 2
Tim Halld8339a72021-05-27 18:49:40 +0100356 elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
Diqing Zhong986e3192020-11-16 16:15:56 +0100357 cycles = 4 * num_ublk_xy
Tim Halld8339a72021-05-27 18:49:40 +0100358 if query.ifm_bits == 16:
Diqing Zhong09387e22020-09-28 18:46:22 +0200359 cycles *= 2
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100360 num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
361 cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
Diqing Zhong09387e22020-09-28 18:46:22 +0200362 elif (
Tim Halld8339a72021-05-27 18:49:40 +0100363 (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
364 or query.npu_block_type == NpuBlockType.VectorProduct
365 or query.npu_block_type == NpuBlockType.ReduceSum
Diqing Zhong09387e22020-09-28 18:46:22 +0200366 ):
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100367 num_kernel_steps = num_kernel_elems
368 cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
Diqing Zhong09387e22020-09-28 18:46:22 +0200369 else:
Tim Halld8339a72021-05-27 18:49:40 +0100370 assert query.config.is_partkernel
371 divider = 2 if query.ifm_bits == 16 else 4
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100372 num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
Diqing Zhong986e3192020-11-16 16:15:56 +0100373 cycles = max(cycles_wb, 4 * num_ublk_xy) * (
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100374 num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
Diqing Zhong09387e22020-09-28 18:46:22 +0200375 )
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100376
377 delay_cycles = 0
378 if arch.accelerator_config is Accelerator.Ethos_U55_32:
379 delay = 7 if use_acc_40bits else 3
380 if num_ublk_x == 1 and num_ublk_y == 1:
381 if num_ublk_z == 1:
382 delay_cycles = delay * num_kernel_steps
383 elif num_kernel_steps > 1:
384 delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
385 if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
386 delay_cycles += delay * num_ublk_z
387 else:
Tim Halld8339a72021-05-27 18:49:40 +0100388 if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
389 delay = 3
390 else:
391 delay = 2
392
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100393 if num_ublk_x == 1 and num_ublk_y == 1:
394 if num_ublk_z == 1:
395 delay_cycles = delay * num_kernel_steps
396 elif num_kernel_steps > 1:
397 delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
398
Tim Halld8339a72021-05-27 18:49:40 +0100399 if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100400 delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)
401
Diqing Zhong09387e22020-09-28 18:46:22 +0200402 cycles_dpu_blk += cycles
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100403 cycles_dpu_blk += delay_cycles
404
Tim Halld8339a72021-05-27 18:49:40 +0100405 if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
406 cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)
Diqing Zhong09387e22020-09-28 18:46:22 +0200407
408 cycles_dpu_blk /= arch.ncores
409
Tim Halld8339a72021-05-27 18:49:40 +0100410 # Estimate output cycles
411 num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
Johan Alfvénf8e353b2022-02-04 17:24:23 +0100412 cycles_output_blk = round_up_to_int(
413 _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
414 )
Diqing Zhong09387e22020-09-28 18:46:22 +0200415
Tim Halld8339a72021-05-27 18:49:40 +0100416 # Scale and bias tensor
417 if query.const_shape.depth > 0:
Diqing Zhongf842b692020-12-11 13:07:37 +0100418 cycles_bias_blk = (
Tim Halld8339a72021-05-27 18:49:40 +0100419 10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
Diqing Zhongf842b692020-12-11 13:07:37 +0100420 )
Diqing Zhong986e3192020-11-16 16:15:56 +0100421 cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)
422
Tim Halld8339a72021-05-27 18:49:40 +0100423 ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
424 cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
425 cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4 # per DPU
426
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100427 cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
428 cycles_output_blk = max(cycles_output_blk, cycles_cmd)
429
Diqing Zhong09387e22020-09-28 18:46:22 +0200430 if cycles_dpu_blk > cycles_output_blk:
Tim Halld8339a72021-05-27 18:49:40 +0100431 total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
Diqing Zhong09387e22020-09-28 18:46:22 +0200432 else:
Tim Halld8339a72021-05-27 18:49:40 +0100433 total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk
Diqing Zhong09387e22020-09-28 18:46:22 +0200434
435 return total_cycles
436
437
Tim Halld8339a72021-05-27 18:49:40 +0100438def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
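    # DMA cost model: the transfer is limited by the slower of the read and
    # write sides; source read latency is added to the read side.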
439 from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
Tim Hall789e6f32021-06-17 17:02:31 +0100440 from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
Tim Halld8339a72021-05-27 18:49:40 +0100441 to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
442 return max(from_cycles, to_cycles)
Diqing Zhonge168b962020-11-05 17:18:47 +0100443
Patrik Gustavssonee99bb12021-04-08 09:04:00 +0200444
Tim Halld8339a72021-05-27 18:49:40 +0100445def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
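    # Measure the estimated MACs and NPU cycles of one operation described by `query`.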
446 cycles = CycleCost()
Diqing Zhonge168b962020-11-05 17:18:47 +0100447
Tim Halld8339a72021-05-27 18:49:40 +0100448 # Convolution/Vector product cycle calculation
449 if query.npu_block_type in (
450 NpuBlockType.ConvolutionMxN,
451 NpuBlockType.ConvolutionDepthWise,
452 NpuBlockType.VectorProduct,
453 NpuBlockType.Pooling,
454 NpuBlockType.ReduceSum,
455 ):
456 # cycles.op_macs and cycles.op_cycles should both handle >32-bits
457 if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
458 cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
Diqing Zhonge168b962020-11-05 17:18:47 +0100459 else:
Tim Halld8339a72021-05-27 18:49:40 +0100460 cycles.op_macs = (
461 int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
462 )
463
464 cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
465 # Elementwise cycle calculation
466 elif query.npu_block_type == NpuBlockType.ElementWise:
467 cycles.op_macs = 0
Johan Alfvénf8e353b2022-02-04 17:24:23 +0100468 ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
469 cycles.op_cycles = round_up_to_int(
470 _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
471 * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
Tim Halld8339a72021-05-27 18:49:40 +0100472 )
Diqing Zhonge168b962020-11-05 17:18:47 +0100473 else:
Tim Halld8339a72021-05-27 18:49:40 +0100474 assert False
Diqing Zhonge168b962020-11-05 17:18:47 +0100475
Tim Halld8339a72021-05-27 18:49:40 +0100476 return cycles
Diqing Zhonge168b962020-11-05 17:18:47 +0100477
478
Tim Halld8339a72021-05-27 18:49:40 +0100479def measure_element_access(arch, query: PerformanceQuery):
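    # Count raw element reads/writes (IFM, OFM, weights, scales) implied by the
    # block decomposition; callers convert these to bytes using element sizes.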
480 access = ElementAccess()
Tim Hall79d07d22020-04-27 18:20:16 +0100481
Tim Halld8339a72021-05-27 18:49:40 +0100482 ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
483 ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
484 ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))
Tim Hall79d07d22020-04-27 18:20:16 +0100485
Tim Halld8339a72021-05-27 18:49:40 +0100486 # Number of ofm blocks in the overall output shape
487 ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
488 ofm_block_depth = ofm_block.depth
489 if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
490 ofm_blocks = ofm_blocks.with_depth(1)
491 ofm_block_depth = query.ifm_shape.depth
Diqing Zhonge168b962020-11-05 17:18:47 +0100492
Tim Halld8339a72021-05-27 18:49:40 +0100493 # Convolution & pooling
494 if query.npu_block_type in (
495 NpuBlockType.ConvolutionMxN,
496 NpuBlockType.ConvolutionDepthWise,
497 NpuBlockType.VectorProduct,
498 NpuBlockType.Pooling,
499 NpuBlockType.ReduceSum,
500 ):
501 # Number of sub kernels
502 sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
503 subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
504 subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])
Tim Hall79d07d22020-04-27 18:20:16 +0100505
Tim Halld8339a72021-05-27 18:49:40 +0100506 ofm_block_count = ofm_blocks.elements()
Tim Hall79d07d22020-04-27 18:20:16 +0100507
Tim Halld8339a72021-05-27 18:49:40 +0100508 ifm_fetch = (
509 Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
510 * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
Diqing Zhonge168b962020-11-05 17:18:47 +0100511 )
Tim Hall79d07d22020-04-27 18:20:16 +0100512
Tim Halld8339a72021-05-27 18:49:40 +0100513 if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
514 kernel_read = query.kernel.elements_wh() * 1 # force to no reread
515 else:
516 kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth
Tim Hall79d07d22020-04-27 18:20:16 +0100517
Tim Halld8339a72021-05-27 18:49:40 +0100518 weight_fetch = kernel_read * ofm_block_depth * ofm_block_count
519
520 access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count
521
522 if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
523 access.const_read[0] = weight_fetch
524 access.const_read[1] = query.ofm_shape.depth # Scales & biases
525 access.weights_refetch = ofm_blocks.elements_wh()
526 # Elementwise
527 elif query.npu_block_type == NpuBlockType.ElementWise:
528 if query.ifm_shape.elements() == 1:
529 if query.ifm_bits > 8:
530 # ifm is a non 8-bit scalar
531 access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
532 if query.ifm2_shape:
533 access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
534 else:
535 access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
536 if query.ifm2_shape:
537 if query.ifm2_shape.elements() > 1:
538 access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
539 elif query.ifm2_bits > 8:
540 # ifm2 is a non 8-bit scalar
541 access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
542 # Unknown
543 else:
544 assert False
545
546 ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
547 access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
548 return access
549
550
551def measure_performance_cost(
552 arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
553):
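    # Measure access counts and cycle cost for a sub-shape of the OFM (or the
    # whole OFM when offset/sub_shape are None) by running the measurements on
    # a clipped copy of the query.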
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
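    # Convert accumulated bytes per memory area into access-cycle estimates and
    # derive the total as the maximum over all cycle sources.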
589 cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
Tim Hall79d07d22020-04-27 18:20:16 +0100590 cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
591 cycles[PassCycles.OnChipFlashAccess] = (
592 np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
593 )
594 cycles[PassCycles.OffChipFlashAccess] = (
595 np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
596 )
597
598 cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
599 return cycles
600
601
Tim Halld8339a72021-05-27 18:49:40 +0100602def estimate_full_op_performance(
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100603 arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
Tim Halld8339a72021-05-27 18:49:40 +0100604):
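    # Full per-operation estimate: compute cycles, element accesses converted to
    # bandwidth, weight/LUT DMA overlapped against the previous operation's
    # slack cycles, and the summary cycle counts.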
605 cycles_a = make_cycles_array()
606 bws = make_bandwidth_array()
607 scaled_bws = make_bandwidth_array() # scaled bw with memory transfer efficiency
608 macs = 0
609
610 query = PerformanceQuery(op.op_type.npu_block_type)
611 query.ifm_shape = op.ifm.shape
612 query.ifm_format = op.ifm.format
613 query.ifm_memory_area = op.ifm.mem_area
614 query.ifm_bits = op.ifm.dtype.size_in_bits()
615 query.ifm2_shape = op.ifm2 and op.ifm2.shape
616 query.ifm2_format = op.ifm2 and op.ifm2.format
617 query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
618 query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
619 query.ofm_shape = op.ofm.shape
620 query.ofm_memory_area = op.ofm.mem_area
621 query.ofm_bits = op.ofm.dtype.size_in_bits()
622 query.ofm_format = op.ofm.format
623 query.kernel = op.kernel
624 query.config = block_config
625
626 cost = schedule.cost_map[op]
627 prev_cost = schedule.cost_map[prev_op] if prev_op else None
628 if op.parent_op.bias:
629 query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000630 if cost.buffered_weight_tensors:
631 query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
Tim Halld8339a72021-05-27 18:49:40 +0100632 else:
633 query.const_memory_area = cost.npu_weights_tensor.mem_area
634
635 cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
636 cycles_a[PassCycles.Npu] = cycles.op_cycles
637 macs = cycles.op_macs
638
639 access = measure_element_access(arch, query)
640
641 # How many NPU cycles are available under the previously executing
642 # operator for performing buffered DMA transfers
643 slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
644
645 # LUT Transfer
646 parent_op = op.parent_op
647 lut_transfer_cycles = 0
648 if parent_op.activation_lut:
649 lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
650 src_tensor = lut_tensor.src_tensor
651 if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
652 bw = src_tensor.storage_size()
653 lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)
654
655 bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
656 # LUT read from SHRAM TODO remove?
Ayaan Masoodd5cbef32022-02-22 15:56:35 +0000657 scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
Tim Halld8339a72021-05-27 18:49:40 +0100658
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000659 if cost.npu_weights_tensor and cost.buffered_weight_tensors:
Tim Halld8339a72021-05-27 18:49:40 +0100660 # DMA Weight Transfer
661 sz = 0
662 # Get the size of the first DMA
663 for core in range(0, arch.ncores):
664 key = WeightKey(core, 0)
665 if key in cost.npu_weights_tensor.encoded_ranges:
666 weight_range = cost.npu_weights_tensor.encoded_ranges[key]
667 sz += round_up(weight_range.total_bytes, 16)
668
669 total_sz = len(cost.npu_weights_tensor.buffer)
670 bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000671 bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz
Tim Halld8339a72021-05-27 18:49:40 +0100672
673 ws_first_transfer_cycles = measure_mem2mem_cycles(
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000674 arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
Tim Halld8339a72021-05-27 18:49:40 +0100675 )
676
677 # Add cycles for Weight + Scale Transfer
Johan Alfvén0f98de62022-05-15 14:54:51 +0200678 if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
679 # Double buffer - weights can be fetched in parallel
680 cycles_a[PassCycles.Npu] = max(
681 cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
682 cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
683 )
684 else:
685 # Standard buffer - weights can not be fetched in parallel so weight transfer
686 # must be included in the result
687 cycles_a[PassCycles.Npu] = (
688 cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
689 )
Tim Halld8339a72021-05-27 18:49:40 +0100690
691 # Add cycles for LUT Transfer
692 cycles_a[PassCycles.Npu] += lut_transfer_cycles
693 else:
694 # Add cycles for LUT Transfer
695 cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)
696
697 # OFM write
698 ofm = op.parent_op.ofm
699 bw = access.ofm_write * ofm.element_size()
700 bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
701 scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
702 arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
703 )
704
705 # IFM read
706 ifm = op.parent_op.ifm
707 bw = access.ifm_read[0] * ifm.element_size()
708 bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
709 scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
710 arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
711 )
712 if query.ifm2_shape:
713 ifm2 = op.parent_op.ifm2
714 bw = access.ifm_read[1] * ifm2.element_size()
715 bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
716 scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
717 arch,
718 True,
719 query.ifm2_memory_area,
720 ifm2.format,
721 op.ifm2.dtype.size_in_bits(),
722 query.config.ifm_block,
723 query.ifm2_shape,
724 bw,
725 )
726
727 # Weight read
728 if access.const_read[0] > 0:
729 # alignment not accounted for in bandwidth_compression_scale_approx
730 encoded_size_approx = (
731 cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
732 )
733 orig_weight_size = parent_op.weights.elements()
734 bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
735 bw = access.const_read[0] * bandwidth_compression_scale_approx
736 bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
737
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000738 if not cost.buffered_weight_tensors:
Patrik Gustavsson225e19d2021-06-01 12:43:43 +0200739 scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw
740
Tim Halld8339a72021-05-27 18:49:40 +0100741 if access.const_read[1] > 0:
742 # Scales & biases
743 bw = access.const_read[1] * op.parent_op.bias.element_size()
744 bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
745
Rickard Bolinfd8b5002022-05-16 09:11:06 +0000746 if not cost.buffered_weight_tensors:
Patrik Gustavsson225e19d2021-06-01 12:43:43 +0200747 scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw
748
Tim Halld8339a72021-05-27 18:49:40 +0100749 update_summary_cycles(arch, scaled_bws, cycles_a)
750
751 return bws, macs, cycles_a
Tim Hall79d07d22020-04-27 18:20:16 +0100752
753
Tim Hallc1be0872022-03-03 17:50:52 +0000754def print_performance(
755 nng: Graph,
756 arch: ArchitectureFeatures,
757 network_type: NetworkType,
758 bws: dict,
759 macs: dict,
760 cycles: dict,
761 mem_usage: dict,
762):
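    # Emit a per-operator table for NPU subgraphs: SRAM usage, cycle breakdown
    # per memory area, and MAC counts relative to the whole network.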
    if network_type == NetworkType.TFLite:
        nng_optype_to_input_op_type = tflite_optype_to_builtintype
    else:
        nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type

    suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}

    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            continue

        print(f"\n{str('#') * 80}")
        print(f"Performance for NPU Subgraph {sg.name}")
        print(
            f" {network_type.name + str(' Operator:'):20s}"
            f" {str('NNG Operator:'):20s}"
            f" {str('SRAM Usage'):>10s}"
            f" ({str('Peak'):>6s}%):"
            f"{str('Op Cycles'):>10s}"
            f" ({str('Netwrk'):>6s}%)"
            f" ["
            f" {str('NPU'):>10s}"
            f" {str('SRAM AC'):>10s}"
            f" {str('DRAM AC'):>10s}"
            f" {str('OnFlash AC'):>10s}"
            f" {str('OffFlashAC'):>10s}"
            f" ]:"
            f"{str('MAC Count'):>10s}"
            f" ({str('Netwrk'):>6s}% / {str('Util'):>6s}%):"
            f"Name:"
        )

        for sched_op in sg.sched_ops:
            # get source op name
            sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
            if sched_op_src_uid == DebugDatabase.NULLREF:
                src_op_type = None
            else:
                src_op_type = suid_inv_map[sched_op_src_uid].type

            src_op_name = nng_optype_to_input_op_type(src_op_type)

            max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
            peak_sram = (
                mem_usage[sched_op] / nng.memory_used[MemArea.Sram] * 100 if MemArea.Sram in nng.memory_used else 0
            )
            print(
                f" {src_op_name:20s}"
                f" {sched_op.op_type:20s}"
                f" {mem_usage[sched_op]:10.0f}"
                f" ({peak_sram:6.2f}%)"
                f" {cycles[sched_op][PassCycles.Total]:10.0f}"
                f" ({cycles[sched_op][PassCycles.Total] / nng.cycles[PassCycles.Total] * 100:6.2f}%)"
                f" ["
                f" {cycles[sched_op][PassCycles.Npu]:10.0f}"
                f" {cycles[sched_op][PassCycles.SramAccess]:10.0f}"
                f" {cycles[sched_op][PassCycles.DramAccess]:10.0f}"
                f" {cycles[sched_op][PassCycles.OnChipFlashAccess]:10.0f}"
                f" {cycles[sched_op][PassCycles.OffChipFlashAccess]:10.0f}"
                f" ]"
                f" {macs[sched_op]:10d}"
                f" ({macs[sched_op] / nng.macs * 100:6.2f}% / {macs[sched_op] / max_macs * 100:6.2f}%)"
                f" {sched_op.name:s}"
            )


def calc_new_performance_for_network(nng: Graph, arch, network_type: NetworkType, verbose_performance: bool):
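    # Walk all scheduled operations, accumulate network totals (bandwidth, MACs,
    # cycles, unique weight sizes) onto the graph, and optionally print the table.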
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage)
889 print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage)