# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# NPU performance estimation functions that estimate the performance of a Pass and CascadedPass. Uses a model that
# takes the maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
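#
# The model is roofline-style: for every operator the estimate is roughly
#
#     op_cycles ~ max(cycles required for computing, cycles required for bandwidth)
#
# where the bandwidth term is derived from element access counts scaled by memory transfer efficiency
# (see update_summary_cycles, which takes the maximum over the NPU and per-memory-area access cycle counts).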
import copy
import csv
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .debug_database import DebugDatabase
from .nn_graph import Graph
from .nn_graph import NetworkType
from .nn_graph import PassPlacement
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .tflite_mapping import optype_to_builtintype as tflite_optype_to_builtintype
from .tosa_mapping import optype_to_tosa_op_type as tosa_optype_to_tosa_op_type
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


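# PerformanceQuery is a plain record of everything the estimators need to know about one operator:
# shapes, formats, memory areas and bit widths for the IFM(s), OFM and constants, plus the kernel
# and the block config chosen by the scheduler. All measure_*/estimate_* functions below consume it.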
class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


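# CycleCost above and ElementAccess below both overload `*` and `+=` so that per-sub-shape results
# can be scaled and accumulated, e.g. (illustrative sketch; `repeats` is a hypothetical loop count):
#
#     total = CycleCost()
#     total += measure_cycle_cost(arch, op_type, faf_type, query) * repeats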
class ElementAccess:
    def __init__(self):
        # List of ONLY element access counts, consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


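# Worked example (illustrative): for an 8-bit NHWC tensor of shape 1x8x8x16, _strides_for_shape
# returns byte strides [1024, 128, 16, 1.0] for the +N, +Y, +X and +Z axes respectively
# (the innermost stride is computed with true division).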
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) / 8  # +N

    return strides


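# The efficiency model below picks an achievable burst length from the tensor format, strides and
# block size, then inflates the payload by the ratio of the memory's native burst to that achievable
# burst. Illustrative sketch with assumed numbers: a 128-byte DRAM burst with a 32-byte achievable
# burst costs 1024 payload bytes as 1024 * (128 / 32) = 4096 bytes of traffic.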
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


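# _estimate_output_cycles_per_element is a table lookup: output_perf_index selects the operator
# class and activation_perf_index the fused activation, and the slower of the two rates dominates.
# For example, an 8-bit Add with a fused Sigmoid selects output index 5 and activation index 0;
# the actual cycles-per-element values come from the arch configuration.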
def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Unary op else Binary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub TODO: Add as perf selection as operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem


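# _estimate_conv_cycles models the DPU as a two-stage pipeline over OFM blocks: the slower of the
# MAC stage and the output stage is paid once per block, the faster stage only once in total, i.e.
# approximately
#
#     total_cycles = max(dpu_blk, out_blk) * num_ofm_blks + min(dpu_blk, out_blk)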
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


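# measure_mem2mem_cycles costs a DMA as the slower of draining the source and filling the
# destination. Worked sketch with assumed parameters: 4096 bytes from DRAM (4 bytes/cycle,
# read latency 250) to SRAM (8 bytes/cycle) gives max(4096 // 4 + 250, 4096 // 8) = 1274 cycles.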
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)


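# Minimal usage sketch for the measure_* functions below; all field values are assumptions for
# illustration, not real hardware numbers:
#
#     query = PerformanceQuery(NpuBlockType.ConvolutionMxN)
#     query.ifm_shape = Shape4D(1, 16, 16, 32)
#     query.ofm_shape = Shape4D(1, 16, 16, 8)
#     query.ifm_bits = query.ofm_bits = 8
#     query.kernel = Kernel(3, 3)
#     query.config = block_config  # an ArchitectureBlockConfig from the scheduler
#     cycles = measure_cycle_cost(arch, Op.Conv2D, None, query)
#     access = measure_element_access(arch, query)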
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    # DMA cycle calculation
    elif query.npu_block_type == NpuBlockType.Dma:
        # Return 0 since this is not an actual NPU op
        cycles.op_cycles = 0
    else:
        assert False

    return cycles


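# measure_element_access returns raw element counts only; callers convert them to bytes with the
# tensor's element size, as estimate_full_op_performance does below,
# e.g. bw = access.ifm_read[0] * ifm.element_size().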
def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # DMA
    elif query.npu_block_type == NpuBlockType.Dma:
        # Return empty access since this is not an actual NPU op
        return access
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


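# estimate_full_op_performance ties the pieces together for one scheduled operator: it builds a
# PerformanceQuery, measures NPU cycles and element accesses, overlays LUT/memcpy/weight DMA
# transfers against the slack cycles available under the previous operator, converts accesses into
# raw and efficiency-scaled bandwidths, and finally applies the roofline maximum via
# update_summary_cycles.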
def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensors:
            query.const_memory_area = cost.buffered_weight_tensors[0].mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    dma_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM TODO remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    # DMA Transfer
    if parent_op.type == Op.Memcpy:
        src_tensor = parent_op.ifm
        dst_tensor = parent_op.ofm
        if src_tensor.mem_area != dst_tensor.mem_area:
            bw = src_tensor.storage_size()
            dma_transfer_cycles += measure_mem2mem_cycles(arch, src_tensor.mem_area, dst_tensor.mem_area, bw)
            bws[src_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Read] += bw
            bws[dst_tensor.mem_area][src_tensor.purpose][BandwidthDirection.Write] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensors:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensors[0].mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensors[0].mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        if cost.buffered_weight_tensors[0].sub_purpose == TensorSubPurpose.DoubleBuffer:
            # Double buffer - weights can be fetched in parallel
            cycles_a[PassCycles.Npu] = max(
                cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
                cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
            )
        else:
            # Standard buffer - weights can not be fetched in parallel so weight transfer
            # must be included in the result
            cycles_a[PassCycles.Npu] = (
                cycles.op_cycles + cost.full_weight_transfer_cycles - min(ws_first_transfer_cycles, slack_cycles)
            )

        # Add cycles for LUT + memcpy op Transfer
        cycles_a[PassCycles.Npu] += dma_transfer_cycles
    else:
        # Add cycles for LUT + memcpy op Transfer
        cycles_a[PassCycles.Npu] += max(dma_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm2 if op.reversed_operands else op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )

    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm if op.reversed_operands else op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensors:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def print_performance(
    nng: Graph,
    arch: ArchitectureFeatures,
    network_type: NetworkType,
    bws: dict,
    macs: dict,
    cycles: dict,
    mem_usage: dict,
    output_basename: str,
):
    def _percentage(part, whole):
        # desired behaviour is for division by zero to return 100%
        if whole == 0:
            return 100.0
        else:
            return part / whole * 100.0

    if network_type == NetworkType.TFLite:
        nng_optype_to_input_op_type = tflite_optype_to_builtintype
    else:
        nng_optype_to_input_op_type = tosa_optype_to_tosa_op_type

    suid_inv_map = {v: k for k, v in DebugDatabase._sourceUID.items()}

    # the header is a list (one entry per column) of tuples (column name, alignment, width, precision)
    header = [
        (f"{network_type.name}_operator", "<", 20, -1),
        ("NNG Operator", "<", 20, -1),
        ("SRAM Usage", ">", 10, 0.0),
        ("Peak%", ">", 6, 0.2),
        ("Op Cycles", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("NPU", ">", 10, 0.0),
        ("SRAM AC", ">", 10, 0.0),
        ("DRAM AC", ">", 10, 0.0),
        ("OnFlash AC", ">", 10, 0.0),
        ("OffFlash AC", ">", 11, 0.0),
        ("MAC Count", ">", 10, 0.0),
        ("Network%", ">", 8, 0.2),
        ("Util%", ">", 6, 0.2),
        ("Name", "<", 20, -1),
    ]

    # open the csv
    csv_file = open(output_basename + "_per-layer.csv", "w", encoding="UTF8")
    writer = csv.writer(csv_file)

    for sg in nng.subgraphs:

        if sg.placement != PassPlacement.Npu:
            continue

        sg_separator_text = f"\n{str('#') * 80}\nPerformance for NPU Subgraph {sg.name}"

        # the data is a list (one entry per op) of lists (matching the header columns)
        data = []
        for sched_op in sg.sched_ops:
            # get source op name
            sched_op_src_uid = DebugDatabase._optimisedUID[sched_op.parent_op][1]
            if sched_op_src_uid == DebugDatabase.NULLREF:
                src_op_type = None
            else:
                src_op_type = suid_inv_map[sched_op_src_uid].original_type

            src_op_name = nng_optype_to_input_op_type(src_op_type)

            max_macs = cycles[sched_op][PassCycles.Total] * arch.num_macs_per_cycle * arch.ncores
            peak_sram = (
                _percentage(mem_usage[sched_op], nng.memory_used[MemArea.Sram])
                if MemArea.Sram in nng.memory_used
                else 0
            )

            data.append(
                [
                    src_op_name,
                    sched_op.op_type,
                    mem_usage[sched_op],
                    peak_sram,
                    cycles[sched_op][PassCycles.Total],
                    _percentage(cycles[sched_op][PassCycles.Total], nng.cycles[PassCycles.Total]),
                    cycles[sched_op][PassCycles.Npu],
                    cycles[sched_op][PassCycles.SramAccess],
                    cycles[sched_op][PassCycles.DramAccess],
                    cycles[sched_op][PassCycles.OnChipFlashAccess],
                    cycles[sched_op][PassCycles.OffChipFlashAccess],
                    macs[sched_op],
                    _percentage(macs[sched_op], nng.macs),
                    _percentage(macs[sched_op], max_macs),
                    sched_op.name,
                ]
            )

        # print to console
        print(sg_separator_text)
        line = ""
        line2 = ""
        for col_name, align, width, _ in header:
            line_data = f"{col_name:{align}{width}}"
            line += line_data + " "
            line2 += "-" * len(line_data) + " "
        print(line)
        print(line2)

        for op_data in data:
            line = ""
            for idx, item in enumerate(op_data):
                _, align, width, precision = header[idx]
                if precision == -1:
                    w = str(width)
                else:
                    w = str(width + precision) + "f"
                line += f"{item:{align}{w}}" + " "
            print(line)

        # print to csv
        writer.writerow((sg_separator_text,))
        writer.writerow(col_name for col_name, _, _, _ in header)
        for op_data in data:
            writer.writerow(op_data)

    # close the csv
    csv_file.close()


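# calc_new_performance_for_network runs estimate_full_op_performance over every scheduled operator,
# accumulates network totals on the Graph (bandwidths, macs, cycles, and original and NPU-encoded
# weight sizes, de-duplicated by tensor equivalence id) and optionally prints/exports the
# per-layer report.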
def calc_new_performance_for_network(
    nng: Graph,
    arch,
    network_type: NetworkType,
    verbose_performance: bool,
    output_basename: str = "output/unnamed_network",
):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    bws = {}
    macs = {}
    cycles = {}
    mem_usage = {}

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws[sched_op], macs[sched_op], cycles[sched_op] = estimate_full_op_performance(
                arch, sg.schedule, sched_op, prev_op, op_info.block_config
            )

            # get op sram usage
            mem_usage[sched_op] = (
                sg.schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(sg.schedule.memory_snapshot)
                else 0
            )

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws[sched_op]
            total_macs += macs[sched_op]
            total_cycles += cycles[sched_op]
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size

    if verbose_performance:
        print_performance(nng, arch, network_type, bws, macs, cycles, mem_usage, output_basename)