# Copyright (C) 2020-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
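#
# Illustrative sketch of the model (figures are hypothetical, not measured):
# an operator that must move 1 MiB over an interface sustaining 4 bytes per
# cycle needs ~262144 'bandwidth' cycles; if its MACs would only take 100000
# 'compute' cycles, the estimate is max(262144, 100000) = 262144.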
import copy
import csv
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .debug_database import DebugDatabase
from .nn_graph import Graph
from .nn_graph import NetworkType
from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


class ElementAccess:
    def __init__(self):
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth) / 8  # +N

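    # Worked example (hypothetical tensor, not from the source): an NHWC,
    # 8-bit tensor of shape 1x8x8x16 yields byte strides of
    # [1024, 128, 16, 1] for +N, +Y, +X and +Z respectively.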
    return strides


def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO: is this check correct for non-8-bit?
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits -> bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
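    # Example (illustrative figures): with a 16-byte memory burst and an
    # achievable burst_len of 8 bytes, half of every burst is wasted, so the
    # returned figure is the raw byte count scaled by 16 / 8 = 2.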
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Binary op if ifm2 is present, else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub (path currently disabled)
            output_perf_index = 4
        else:
            # Advanced Add/Sub. TODO: add as a perf selection per operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

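    # Both indices select per-element cycle figures from the architecture's
    # lookup tables; the slower of the output path and the activation path
    # dominates.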
    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem


def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

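    # The DPU stage and the output stage run in a pipeline: the slower stage
    # dominates steady state while the faster one is paid once for fill/drain.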
    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
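    # Illustrative figures: 4096 bytes from DRAM (4 bytes/cycle, 250-cycle
    # read latency) to SRAM (8 bytes/cycle) gives max(4096 // 4 + 250,
    # 4096 // 8) = 1274 cycles; the slower of source and destination limits.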
    return max(from_cycles, to_cycles)


def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    else:
        assert False

    return cycles


def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels (limits are indexed [height, width], matching
        # _estimate_conv_cycles above)
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force no re-read
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
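
    # Callers convert these element counts to bytes, e.g. (sketch mirroring
    # estimate_full_op_performance below):
    #   bw = access.ifm_read[0] * ifm.element_size()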
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)
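

# Both arrays are indexed by the IntEnums defined in this module, e.g. (sketch):
#   bws = make_bandwidth_array()
#   bws[MemArea.Sram][TensorPurpose.Weights][BandwidthDirection.Read] += 1024
#   cycles = make_cycles_array()
#   cycles[PassCycles.Npu] += 5000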


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

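    # Illustrative figures: 8 KiB of SRAM traffic at 4 bytes/cycle gives
    # SramAccess = 2048 cycles; Total then takes the max over all components,
    # matching the max(bandwidth, compute) model described at the top of file.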
    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensors:
            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
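    # DMA work that fits inside this slack is effectively free; only the
    # residue extends the critical path (see the weight and LUT terms below).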

    # LUT Transfer
    parent_op = op.parent_op
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM. TODO: remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
            # Double buffer - weights can be fetched in parallel
            cycles_a[PassCycles.Npu] = max(
                cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
                cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
            )
        else:
            # Standard buffer - weights cannot be fetched in parallel, so the
            # weight transfer must be included in the result
            cycles_a[PassCycles.Npu] = (
                cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
            )

        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )
    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def print_performance(
    nng: Graph,
    arch: ArchitectureFeatures,
    network_type: NetworkType,
    bws: dict,
    macs: dict,
    cycles: dict,
    mem_usage: dict,
    output_basename: str,
):
    def _percentage(part, whole):
        # desired behaviour is for division by zero to return 100%
        if whole == 0:
            return 100.0
        else:
            return part / whole * 100.0

    if network_type == NetworkType.TFLite:
        nng_optype_to_input_op_type = tflite_optype_to_builtintype
    else:
        nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type

    suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}

    # the header is a list (one entry per column) of tuples (column name, alignment, width, precision)
    header = [
        (f"{network_type.name}_operator", "<", 20, -1),
        ("NNG Operator", "<", 20, -1),
        ("SRAM Usage", ">", 10, 0.0),
        ("Peak%", ">", 6, 0.2),
        ("Op Cycles", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("NPU", ">", 10, 0.0),
        ("SRAM AC", ">", 10, 0.0),
        ("DRAM AC", ">", 10, 0.0),
        ("OnFlash AC", ">", 10, 0.0),
        ("OffFlash AC", ">", 11, 0.0),
        ("MAC Count", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("Util%", ">", 6, 0.2),
        ("Name", "<", 20, -1),
    ]
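    # a precision of -1 marks a plain string column; otherwise width and
    # precision combine into a float format spec, e.g. 10 and 0.0 -> "{:>10.0f}"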

    # open the csv
    csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8")
    writer = csv.writer(csv_file)

    for sg in nng.subgraphs:

        if sg.placement != PassPlacement.Npu:
            continue

        sg_separator_text = f"\n{'#' * 80}\nPerformance for NPU Subgraph {sg.name}"

        # the data is a list (one entry per op) of lists (matching the header columns)
        data = []
        for sched_op in sg.sched_ops:
            # get source op name
            sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
            if sched_op_src_uid == DebugDatabase.NULLREF:
                src_op_type = None
            else:
                src_op_type = suid_inv_map[sched_op_src_uid].original_type

            src_op_name = nng_optype_to_input_op_type(src_op_type)

            max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
            peak_sram = (
                _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram])
                if MemArea.Sram in nng.memory_used
                else 0
            )

            data.append(
                [
                    src_op_name,
                    sched_op.op_type,
                    mem_usage[sched_op],
                    peak_sram,
                    cycles[sched_op][PassCycles.Total],
                    _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]),
                    cycles[sched_op][PassCycles.Npu],
                    cycles[sched_op][PassCycles.SramAccess],
                    cycles[sched_op][PassCycles.DramAccess],
                    cycles[sched_op][PassCycles.OnChipFlashAccess],
                    cycles[sched_op][PassCycles.OffChipFlashAccess],
                    macs[sched_op],
                    _percentage(macs[sched_op], nng.macs),
                    _percentage(macs[sched_op], max_macs),
                    sched_op.name,
                ]
            )

        # print to console
        print(sg_separator_text)
        line = ""
        line2 = ""
        for col_name, align, width, _ in header:
            line_data = f"{col_name:{align}{width}}"
            line += line_data + " "
            line2 += "-" * len(line_data) + " "
        print(line)
        print(line2)

        for op_data in data:
            line = ""
            for idx, item in enumerate(op_data):
                _, align, width, precision = header[idx]
                if precision == -1:
                    w = str(width)
                else:
                    w = str(width + precision) + "f"
                line += f"{item:{align}{w}}" + " "
            print(line)

        # print to csv
        writer.writerow((sg_separator_text,))
        writer.writerow(col_name for col_name, _, _, _ in header)
        for op_data in data:
            writer.writerow(op_data)

    # close the csv
    csv_file.close()


def calc_new_performance_for_network(
    nng: Graph,
    arch,
    network_type: NetworkType,
    verbose_performance: bool,
    output_basename: str = "output/unnamed_network",
):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename)
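

# Typical use (sketch, not taken from the source): after scheduling has
# produced sg.schedule for each NPU subgraph, the driver calls
#   calc_new_performance_for_network(nng, arch, NetworkType.TFLite,
#                                    verbose_performance=True,
#                                    output_basename="output/my_net")
# which populates nng.bandwidths, nng.macs and nng.cycles and, when verbose,
# prints the per-layer table and writes <output_basename>_per-layer.csv.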