# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
import copy
from enum import auto
from enum import IntEnum
from typing import Optional
from typing import Set
from uuid import UUID

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .nn_graph import Graph
from .numeric_util import round_up
from .numeric_util import round_up_to_int
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .scheduler import SchedulerOpInfo
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
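    """Description of a single NPU operation for the estimators in this module.

    Carries the IFM/IFM2/OFM shapes, formats, memory areas and bit widths, the
    constant (scale/bias) shape and location, the kernel, and the block
    configuration chosen for the operation. See estimate_full_op_performance()
    below for how it is populated from a SchedulerOperation.
    """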
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
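    """MAC and cycle counts for a single operation.

    Supports scaling by a repeat count (__mul__) and accumulation across
    operations (__iadd__).
    """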
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


class ElementAccess:
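    """Element access counts for a single operation.

    All values are element counts only; consumers scale them by the relevant
    element bit width to obtain memory bandwidth in bytes.
    """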
    def __init__(self):
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read={}".format(self.ifm_read, self.ofm_write, self.const_read)


def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
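    """Return the byte strides for stepping through `shape` in `format`.

    Illustrative example: for an 8-bit NHWC tensor of shape 1x8x8x16 the
    returned strides per +N/+Y/+X/+Z step are [1024, 128, 16, 1] bytes.
    """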
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits / 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits / 8  # +Z
        strides[3] = (element_bits * 16) / 8  # +X
        strides[2] = (element_bits * 16 * shape.width) / 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) / 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth) / 8  # +N

    return strides


def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
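    """Scale a transfer size by how efficiently it bursts to/from `mem_area`.

    A burst length is derived from the tensor format, block shape and strides,
    then capped at the memory area's configured burst length; the returned
    value is `to_transfer` inflated by memory_burst_length / burst_len, so
    fully contiguous transfers pass through unchanged.
    """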
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO is this check correct for non 8-bit
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
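    """Estimate cycles to read one IFM block and write one OFM block.

    Each figure is the memory area's access latency plus the efficiency-scaled
    block transfer divided by the area's bandwidth per cycle.
    """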
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
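    """Estimate average cycles per element for producing the OFM.

    Selects per-element costs from the architecture's output and activation
    cycle tables, takes the larger of the two, and for elementwise operations
    also accounts for the per-block command and IFM/OFM block transfer
    overhead.
    """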
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Binary op (ifm2 present) else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        if False:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub TODO: Add as perf selection as operator variant
            output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem


def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
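    """Estimate total NPU cycles for convolution-like operations.

    Models DPU cycles per sub-kernel and output micro-block (including
    accumulator delay cycles for narrow block configurations), estimates the
    output-generation cycles per OFM block, and overlaps the two: the slower
    path is paid once per OFM block, the faster path only once in total.
    """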
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = round_up_to_int(
        _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()
    )

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
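    """Estimate cycles for a DMA of `to_transfer` bytes between memory areas.

    Illustrative example: 1024 bytes from a source with 4 bytes/cycle and 32
    cycles read latency to a destination with 8 bytes/cycle costs
    max(1024 // 4 + 32, 1024 // 8) = 288 cycles.
    """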
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    from_cycles += arch.memory_latency[from_mem_area][BandwidthDirection.Read]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)


def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
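    """Return the MAC count and NPU cycle count for the queried operation."""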
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
        cycles.op_cycles = round_up_to_int(
            _estimate_output_cycles_per_element(arch, op_type, faf_type, query)
            * Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
        )
    else:
        assert False

    return cycles


def measure_element_access(arch, query: PerformanceQuery):
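    """Count elements read and written by the queried operation.

    Convolution-like operations are costed per OFM block and sub-kernel;
    elementwise operations are costed on the rounded-up OFM shape, with
    special handling for scalar (broadcast) inputs.
    """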
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        subkernels = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
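    """Measure element access and cycle cost for all or part of an operation.

    `offset` and `sub_shape` select a clipped region of the OFM to cost;
    offset defaults to the origin and sub_shape to the full OFM shape.
    """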
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    access = ElementAccess()
    cycles = CycleCost()

    cycle_tmp = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    cycles += cycle_tmp
    access = measure_element_access(arch, sub_query)

    return access, cycles


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: Optional[SchedulerOperation], block_config
):
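    """Estimate bandwidth, MACs and cycles for one scheduled operation.

    Builds a PerformanceQuery from the SchedulerOperation, measures compute
    cycles and element access, then accounts for LUT and buffered-weight DMA
    transfers, overlapping them with any slack cycles under the previous
    operation. Returns (bws, macs, cycles) where the cycle summary is derived
    from the efficiency-scaled bandwidths.
    """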
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensor:
            query.const_memory_area = cost.buffered_weight_tensor.mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM TODO remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw

    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        cycles_a[PassCycles.Npu] = max(
            cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
            cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
        )

        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )
    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensor:
            scaled_bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

        if not cost.buffered_weight_tensor:
            scaled_bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def calc_new_performance_for_network(nng: Graph, arch):
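    """Accumulate per-operation estimates into network totals on `nng`.

    Original and encoded weight sizes are summed once per unique tensor
    (tracked by equivalence id) so that shared weights are not counted twice.
    """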
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)
    total_weight_size = 0
    total_encoded_weight_size = 0

    # Store unique instances of original/encoded weight tensor uuids to prevent double counting of weights
    original_weight_uuids: Set[UUID] = set()
    encoded_npu_weight_uuids: Set[UUID] = set()

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info: SchedulerOpInfo = sg.schedule.cost_map[sched_op]
            bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)

            # Tensors for calculating weight sizes
            original_weight = sched_op.parent_op.weights
            encoded_npu_weight = op_info.npu_weights_tensor

            # Save UUIDs of original_weight so only unique instances of tensors are used to calculate weights
            if original_weight and (original_weight.equivalence_id not in original_weight_uuids):
                original_weight_uuids.add(original_weight.equivalence_id)
                total_weight_size += original_weight.values.itemsize * original_weight.values.size

            # Save UUIDs of encoded_npu_weight so only unique instances of tensors are used to calculate weights
            if encoded_npu_weight and (encoded_npu_weight.equivalence_id not in encoded_npu_weight_uuids):
                encoded_npu_weight_uuids.add(encoded_npu_weight.equivalence_id)
                total_encoded_weight_size += len(encoded_npu_weight.buffer)

            total_bws += bws
            total_macs += macs
            total_cycles += cycles
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles
    nng.total_original_weights = total_weight_size
    nng.total_npu_encoded_weights = total_encoded_weight_size