# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
import copy
import csv
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .debug_database import DebugDatabase
from .nn_graph import Graph
from .nn_graph import NetworkType
from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()

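# A PerformanceQuery is typically filled in from a SchedulerOperation before
# calling the measure_* functions below. A minimal sketch with hypothetical
# values:
#
#   query = PerformanceQuery(NpuBlockType.ConvolutionMxN)
#   query.ifm_shape = Shape4D(1, 56, 56, 32)
#   query.ifm_bits = 8
#   query.ofm_shape = Shape4D(1, 56, 56, 64)
#   query.ofm_bits = 8
#   query.kernel = Kernel(3, 3)
#   query.config = block_config  # an ArchitectureBlockConfig from the scheduler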

class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)

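# CycleCost supports scaling and accumulation, e.g. (illustrative):
#
#   total = CycleCost()
#   sub = CycleCost()
#   sub.op_macs, sub.op_cycles = 100, 50
#   total += sub * 4        # four identical sub-operations
#   str(total)              # -> "macs = 400, cycles = 200"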

class ElementAccess:
    def __init__(self):
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth) / 8  # +N

    return strides

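# Worked example (illustrative): for an 8-bit NHWC tensor of shape 1x8x8x16,
# _strides_for_shape(Shape4D(1, 8, 8, 16), TensorFormat.NHWC, 8) returns
# [1024, 128, 16, 1.0] - the byte distances to step one unit in +N, +Y, +X
# and +Z respectively.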

def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO: is this check correct for non 8-bit?
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits -> bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)

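# Worked example (illustrative): if the access pattern above only achieves
# 16-byte bursts against a memory whose natural burst length is 64 bytes, a
# transfer of 1024 bytes is costed as 1024 * (64 / 16) = 4096 effective bytes,
# i.e. a 4x bandwidth penalty.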

def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Binary op (ifm2 present), else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub TODO: Add as perf selection as operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem

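# Example (illustrative figures): if the selected output path costs 0.5
# cycles/element and the fused activation path costs 0.25 cycles/element, the
# base estimate is max(0.5, 0.25) = 0.5 cycles/element; for elementwise ops
# this may then be raised further by the per-block command overhead above.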

def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

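    # The DPU and the output stage run as a pipeline across the OFM blocks, so
    # the slower of the two dominates and the faster one only contributes the
    # cost of a single block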
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)

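# Worked example (illustrative): moving 4096 bytes from a DRAM that sustains
# 8 bytes/cycle with a 250 cycle read latency, to an SRAM that sustains
# 16 bytes/cycle, costs max(4096 // 8 + 250, 4096 // 16) = max(762, 256)
# = 762 cycles.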

def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    else:
        assert False

    return cycles

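# Example of the MAC count above (illustrative): a 3x3 ConvolutionMxN with a
# 16-channel IFM and a 1x16x16x8 OFM performs 3*3 * 16 * (16*16*8) = 294912
# MACs, independent of block configuration.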

def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles

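# A sub-shape query can be used to cost just a slice of the OFM, e.g. the
# first eight rows (sketch with hypothetical shapes, assuming a populated
# query as above and an already-known op_type):
#
#   access, cycles = measure_performance_cost(
#       arch, op.op_type, Op.Relu, query, Shape4D(0, 0, 0, 0), Shape4D(1, 8, 56, 64)
#   )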

def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

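    # The overall estimate is the roofline maximum of the compute (NPU) cycles
    # and the access cycles of each memory area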
    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensors:
            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM TODO remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
            # Double buffer - weights can be fetched in parallel
            cycles_a[PassCycles.Npu] = max(
                cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
                cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
            )
        else:
            # Standard buffer - weights can not be fetched in parallel so weight transfer
            # must be included in the result
            cycles_a[PassCycles.Npu] = (
                cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
            )

        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )
    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def print_performance(
    nng: Graph,
    arch: ArchitectureFeatures,
    network_type: NetworkType,
    bws: dict,
    macs: dict,
    cycles: dict,
    mem_usage: dict,
    output_basename: str,
):
    def _percentage(part, whole):
        # desired behaviour is for division by zero to return 100%
        if whole == 0:
            return 100.0
        else:
            return part / whole * 100.0

    if network_type == NetworkType.TFLite:
        nng_optype_to_input_op_type = tflite_optype_to_builtintype
    else:
        nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type

    suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}

    # the header is a list (one entry per column) of tuples (column name, alignment, width, precision)
    header = [
        (f"{network_type.name}_operator", "<", 20, -1),
        ("NNG Operator", "<", 20, -1),
        ("SRAM Usage", ">", 10, 0.0),
        ("Peak%", ">", 6, 0.2),
        ("Op Cycles", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("NPU", ">", 10, 0.0),
        ("SRAM AC", ">", 10, 0.0),
        ("DRAM AC", ">", 10, 0.0),
        ("OnFlash AC", ">", 10, 0.0),
        ("OffFlash AC", ">", 11, 0.0),
        ("MAC Count", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("Util%", ">", 6, 0.2),
        ("Name", "<", 20, -1),
    ]
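    # e.g. ("Peak%", ">", 6, 0.2) renders a value as f"{value:>6.2f}", while a
    # precision of -1 renders a width-only field such as f"{value:<20}"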

    # open the csv
    csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8")
    writer = csv.writer(csv_file)

    for sg in nng.subgraphs:

        if sg.placement != PassPlacement.Npu:
            continue

        sg_separator_text = f"\n{str('#') * 80}\nPerformance for NPU Subgraph {sg.name}"

        # the data is a list (one entry per op) of lists (matching the header columns)
        data = []
        for sched_op in sg.sched_ops:
            # get source op name
            sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
            if sched_op_src_uid == DebugDatabase.NULLREF:
                src_op_type = None
            else:
                src_op_type = suid_inv_map[sched_op_src_uid].original_type

            src_op_name = nng_optype_to_input_op_type(src_op_type)

            max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
            peak_sram = (
                _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram])
                if MemArea.Sram in nng.memory_used
                else 0
            )

            data.append(
                [
                    src_op_name,
                    sched_op.op_type,
                    mem_usage[sched_op],
                    peak_sram,
                    cycles[sched_op][PassCycles.Total],
                    _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]),
                    cycles[sched_op][PassCycles.Npu],
                    cycles[sched_op][PassCycles.SramAccess],
                    cycles[sched_op][PassCycles.DramAccess],
                    cycles[sched_op][PassCycles.OnChipFlashAccess],
                    cycles[sched_op][PassCycles.OffChipFlashAccess],
                    macs[sched_op],
                    _percentage(macs[sched_op], nng.macs),
                    _percentage(macs[sched_op], max_macs),
                    sched_op.name,
                ]
            )

        # print to console
        print(sg_separator_text)
        line = ""
        line2 = ""
        for col_name, align, width, _ in header:
            line_data = f"{col_name:{align}{width}}"
            line += line_data + " "
            line2 += "-" * len(line_data) + " "
        print(line)
        print(line2)

        for op_data in data:
            line = ""
            for idx, item in enumerate(op_data):
                _, align, width, precision = header[idx]
                if precision == -1:
                    w = str(width)
                else:
                    w = str(width + precision) + "f"
                line += f"{item:{align}{w}}" + " "
            print(line)

        # print to csv
        writer.writerow((sg_separator_text,))
        writer.writerow(col_name for col_name, _, _, _ in header)
        for op_data in data:
            writer.writerow(op_data)

    # close the csv
    csv_file.close()


def calc_new_performance_for_network(
    nng: Graph,
    arch,
    network_type: NetworkType,
    verbose_performance: bool,
    output_basename: str = "output/unnamed_network",
):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):

                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):

                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename)
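

# Typical entry point (sketch): after scheduling, populate the graph's
# performance fields and optionally emit the per-layer report:
#
#   calc_new_performance_for_network(nng, arch, NetworkType.TFLite, verbose_performance=True)
#   total_cycle_estimate = nng.cycles[PassCycles.Total]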