# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
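#
# In outline, the per-operator model below works roughly as (illustrative summary):
#
#   compute_cycles : DPU/output datapath cycles, from measure_cycle_cost()
#   access_cycles  : bytes moved / bandwidth, per memory area, from the bandwidth arrays
#   total          : max(compute_cycles, access_cycles), see update_summary_cycles()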
import copy
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .nn_graph import Graph
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return (
            "NPU",
            "SRAM Access",
            "DRAM Access",
            "On-chip Flash Access",
            "Off-chip Flash Access",
            "Total",
            "Size",
        )[self.value]

    def identifier_name(self):
        return (
            "npu",
            "sram_access",
            "dram_access",
            "on_chip_flash_access",
            "off_chip_flash_access",
            "total",
            "size",
        )[self.value]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


class ElementAccess:
    def __init__(self):
        # Element access counts ONLY; consumers need to scale these
        # values by the correct bitwidths to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


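# Byte strides for stepping through a tensor in memory, indexed from the
# outermost dimension inwards. For example, an 8-bit NHWC tensor of shape
# 1x4x4x8 yields strides [128, 32, 8, 1] for (+N, +Y, +X, +Z). NHCWB16 stores
# channels in bricks of 16, hence the extra stride entry.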
def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth) / 8  # +N

    return strides


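# Scale a raw transfer size by how poorly block accesses map onto memory
# bursts; the result is the effective amount of bus traffic. As a worked
# example, if the memory burst length is 128 bytes but contiguous runs are
# only 32 bytes, a 1024-byte transfer is charged as 1024 * (128 / 32) = 4096.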
def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


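# Lower bound on the cycles needed to fetch one IFM block and write back one
# OFM block: the memory area's fixed read/write latency plus transfer time at
# that area's bandwidth, including the burst-efficiency penalty above.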
def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


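# Select a cycles-per-element figure for the output and activation stages.
# output_perf_index and activation_perf_index are row selectors into the
# per-accelerator tables arch.output_cycles_per_elem and
# arch.activation_cycles_per_elem; the slower of the two stages dominates.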
def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Binary op if ifm2 is present, else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub TODO: Add as perf selection as operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem


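# Estimate DPU cycles for computing the OFM blocks of a convolution-family op.
# Kernels above the hardware sub-kernel limits (arch.sub_kernel_limits) are
# decomposed into sub-kernels, each costed across the OFM block's micro-block
# (ublock) grid with accelerator-specific pipeline delay cycles. DPU and
# output-stage work overlap across blocks, so the slower of the two is paid
# once per OFM block and the faster only once in total.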
def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


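# Cost of a DMA transfer between two memory areas: read and write sides
# proceed in parallel, so the slower side dominates; the source read latency
# is added to the read side.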
def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)


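# Top-level cycle and MAC cost for a single operation. As a worked example of
# the MAC count: a 3x3 convolution with a 16-channel IFM and a 32x32x8 OFM
# costs 9 * 16 * (32 * 32 * 8) = 1,179,648 MACs; depthwise and pooling ops
# multiply by 1 instead of the IFM depth.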
def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    else:
        assert False

    return cycles


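# Count element accesses (not bytes) for an operation: IFM fetches per OFM
# block and sub-kernel, weight fetches (refetched once per OFM block position
# in the XY plane), and OFM writes. Callers scale these counts by element
# size to obtain bandwidth.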
def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


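# Measure access and cycle cost for a clipped sub-region of the operation's
# OFM; offset and sub_shape default to the whole OFM when omitted.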
def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


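# Accumulator arrays: bandwidths are tracked per (memory area, tensor purpose,
# read/write direction); cycles are tracked per PassCycles category.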
def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


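# Convert accumulated bandwidth per memory area into access-cycle estimates,
# then take the maximum over the NPU and memory-access categories as the
# total: compute and memory traffic are assumed to overlap, so the slowest
# resource sets the pass cycle count.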
def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


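# Full performance estimate for one scheduled operation: build a
# PerformanceQuery from the scheduled tensors, measure compute cycles and
# element accesses, then add DMA costs for LUT and double-buffered weight
# transfers (hidden under the previous operator's slack cycles where possible)
# and convert accesses into raw and efficiency-scaled bandwidth.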
def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensor:
            query.const_memory_area = cost.buffered_weight_tensor.mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM TODO remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        cycles_a[PassCycles.Npu] = max(
            cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
            cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
        )

        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )
    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensor:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensor:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


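# Network-level roll-up: accumulate bandwidth, MACs and cycles over every
# scheduled operation in every subgraph, and total the original and encoded
# weight sizes, de-duplicated by tensor equivalence id so shared weights are
# counted once.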
def calc_new_performance_for_network(nng: Graph, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):

                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):

                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws
            total_macs += macs
            total_cycles += cycles
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size