# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions for estimating the performance of a Pass and CascadedPass. Uses a model that
# takes the maximum of the 'cycles required for bandwidth' and the 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_features import Accelerator
from .architecture_features import Block
from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


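# A minimal sketch (not used by the compiler) of the roofline-style model described in
# the header comment: a pass costs the maximum of its compute-bound and bandwidth-bound
# cycle counts. All names and numbers here are hypothetical.
def _example_roofline_cycles(compute_cycles, bytes_transferred, bytes_per_cycle):
    # Cycles required for bandwidth: total memory traffic over available bandwidth
    bandwidth_cycles = bytes_transferred / bytes_per_cycle
    # The pass is limited by whichever of the two resources is busier
    return max(compute_cycles, bandwidth_cycles)

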
def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The height calculation performed here is for the worst case
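    # e.g. (hypothetical numbers): an IFM block height of 8 and a pass-1 block height of
    # 16 give round_up(8 + 16, 16) == 32 rows of rolling buffer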
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def make_metrics_arrays():
    return (make_bandwidth_array(), 0, make_cycles_array())


def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

    return min(ifm_depth, ifm_blk_depth)

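# A worked example of the selection above (hypothetical values): an 8-bit IFM with
# depth-first traversal in a ConvolutionMxN block selects a 32-deep IFM block,
# clamped to the real IFM depth:
#     get_ifm_block_depth(NpuBlockType.ConvolutionMxN, 24, 8,
#                         TensorBlockTraversal.DepthFirst, 16)  # -> min(24, 32) == 24
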

def get_minimal_cmd_cycles(
    arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
):
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    return (
        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
        + cycles_ofm_blk
    ) / 4


def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    faf = None if primary_op.activation is None else primary_op.activation.op_type
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch,
            ifm_tensor,
            ofm_tensor,
            block_config,
            block_config,
            num_elems_blk * cycle_per_elem,
            primary_op.ifm_shapes[0],
            primary_op.ofm_shapes[0],
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem


def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = primary_op.ifm_shapes[0]
    ofm_tens_shape = primary_op.ofm_shapes[0]

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape.height == 1
        # Optimisation only applies for even-width tensors
        and ofm_tens_shape.width % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_ofm_blk = 0
    total_cycles = 0
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)
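    # e.g. a hypothetical 3x9 (HxW) kernel under an assumed 8x8 sub-kernel limit splits
    # into sub_kernel_y == [3] and sub_kernel_x == [8, 1], i.e. two sub-kernels of 24 and
    # 3 elements, each of which refetches the IFM block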

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
    )
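    # e.g. a hypothetical 64x64x32 OFM traversed in 16x16x16 blocks gives
    # 4 * 4 * 2 == 32 OFM blocks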

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        cycles_bias_blk = (
            10
            * min(ofm_block.depth, ofm_tens_shape.depth)
            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
            / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    cycles_cmd = get_minimal_cmd_cycles(
        arch,
        ifm_tensor,
        ofm_tensor,
        ifm_block,
        ofm_block,
        cycles_dpu_blk,
        ifm_tens_shape,
        ofm_tens_shape,
        cycles_output_blk,
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles


def estimate_memory_transfer_efficiency(
    arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
):
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length; the achievable
    # burst length depends on the data format, the block shape and the tensor shape
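    # e.g. (hypothetical numbers) an int8 NHCWB16 read whose channel stride matches the
    # block depth bursts block.depth * block.width bytes at a time; if only a 16-byte
    # burst is achieved against a 64-byte natural burst length for the memory, the
    # estimated bandwidth below is scaled up by 64 / 16 == 4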
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    tens = tensor.clone()

    if not tensor.needs_linear_format:
        tens.set_format(TensorFormat.NHCWB16, arch)
    strides = tens.get_strides(shape4D=shape4D)

    if tens.format == TensorFormat.NHCWB16:
        if strides[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    return bw * (arch.memory_burst_length[mem_area] / burst_len)


def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0
    cycles = make_cycles_array()
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Npu and primary_op:
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = ps.primary_op.ifm_shapes[0]
        ofm_tensor_shape = ps.primary_op.ofm_shapes[0]
        ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
        ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
        ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)

        if npu_block_type == NpuBlockType.ReduceSum:
            block_traversal = TensorBlockTraversal.DepthFirst
        elif npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
        ):
            block_traversal = weight_tensor.block_traversal
        else:
            block_traversal = TensorBlockTraversal.Default
        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
        )
        ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
        ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)

        if npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
        ):
            # extend the IFM to its full dimensions
            batch_size = ifm_tensor_shape.batch

            # add in padding, height += top and bottom, width += left and right
            ifm_tensor_shape = ifm_tensor_shape.add(
                0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0
            )

            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    # For vector products, the IO weight format is extended to HWIO, with H = W = 1
                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

                nn_ops = (
                    int(ofm_tensor_shape.batch)
                    * int(ofm_tensor_shape.height)
                    * int(ofm_tensor_shape.width)
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                weight_tensor_shape = [
                    *primary_op.get_kernel_size(),
                    1,
                    ifm_tensor_shape.depth,
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # Count the sub-kernels; the IFM block needs to be refetched for each of them
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
                n_full_depth_stages = 1  # force no re-read
Tim Hall79d07d22020-04-27 18:20:16 +0100533
Diqing Zhong69aadd02020-12-08 13:08:48 +0100534 ifm_read_multiple = n_sub_kernels * n_full_depth_stages
535 replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple
Tim Hall79d07d22020-04-27 18:20:16 +0100536
Diqing Zhong69aadd02020-12-08 13:08:48 +0100537 weight_read_multiple = numeric_util.round_up_divide(
patrik.gustavssoneeb85152020-12-21 17:10:40 +0000538 ofm_tensor_shape.height, ofm_block.height
539 ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
Tim Hall79d07d22020-04-27 18:20:16 +0100540 replacement_read_bws[weight_tensor] = (
541 batch_size
542 * shape_num_elements(weight_tensor_bandwidth_shape)
543 * weight_tensor_element_size
544 * weight_tensor_bandwidth_compression_scale
Diqing Zhong69aadd02020-12-08 13:08:48 +0100545 * weight_read_multiple
546 )
Tim Hall79d07d22020-04-27 18:20:16 +0100547
Diqing Zhong69aadd02020-12-08 13:08:48 +0100548 macs += nn_ops
Diqing Zhong42e833d2020-10-02 13:18:42 +0200549 cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
Diqing Zhong986e3192020-11-16 16:15:56 +0100550 arch,
551 npu_block_type,
552 primary_op,
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100553 ifm_block,
Diqing Zhong986e3192020-11-16 16:15:56 +0100554 ofm_block,
555 block_traversal,
556 kernel_dims,
557 ifm_tensor,
558 ofm_tensor,
559 ps.scale_tensor,
Diqing Zhong09387e22020-09-28 18:46:22 +0200560 )
Diqing Zhonge8887a32020-09-24 09:53:48 +0200561 elif npu_block_type == NpuBlockType.ElementWise:
Tim Hall79d07d22020-04-27 18:20:16 +0100562 # Work out how many elements we have and calculate performance.
Diqing Zhong42e833d2020-10-02 13:18:42 +0200563 cycles[PassCycles.Npu] = estimate_output_cycles(
Diqing Zhongef0c7fe2020-11-24 14:38:20 +0100564 arch,
565 npu_block_type,
566 primary_op,
567 ofm_tensor.elements(),
568 ps.ifm_tensor,
569 ps.ofm_tensor,
570 None,
571 ps.ifm2_tensor,
572 ofm_block,
Diqing Zhong09387e22020-09-28 18:46:22 +0200573 )
Diqing Zhong42e833d2020-10-02 13:18:42 +0200574
575 prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
576 if prev_npu_pass is None:
577 # cycles for DMA ops in first pass
578 dma_ops = (op for op in ps.ops if op.type == Op.DMA)
579 for dma_op in dma_ops:
580 mem_area = dma_op.attrs["source"]
581 for tens in dma_op.inputs:
582 cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]
583
Michael McGeagh6f725262020-12-03 15:21:36 +0000584 if rewrite_list is not None:
585 # apply the desired rewrites
586 for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
587 if ps != ps_to_rewrite:
588 continue
589 if rewrite_op == SchedulerRewrite.Nop:
590 pass # these are fine, no bandwidth changes
591 elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
Diqing Zhong69aadd02020-12-08 13:08:48 +0100592 bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
Michael McGeagh6f725262020-12-03 15:21:36 +0000593 if tens.purpose == TensorPurpose.FeatureMap:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100594 scaled_bw = estimate_memory_transfer_efficiency(
Michael McGeagh6f725262020-12-03 15:21:36 +0000595 arch,
596 arch.fast_storage_mem_area,
597 BandwidthDirection.Read,
598 tens,
599 ifm_block,
600 replacement_read_bws[tens],
601 )
602 else:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100603 scaled_bw = replacement_read_bws[tens]
604 scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
Michael McGeagh6f725262020-12-03 15:21:36 +0000605 replacement_read_bws[tens] = 0
Tim Hall79d07d22020-04-27 18:20:16 +0100606
607 for tens in ps.outputs:
608 if force_outputs_to_fast_storage:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100609 bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
610 scaled_bws[arch.fast_storage_mem_area][tens.purpose][
611 BandwidthDirection.Write
612 ] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100613 arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
Diqing Zhonge168b962020-11-05 17:18:47 +0100614 )
Tim Hall79d07d22020-04-27 18:20:16 +0100615 else:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100616 bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
617 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100618 arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
Diqing Zhonge168b962020-11-05 17:18:47 +0100619 )
Tim Hall79d07d22020-04-27 18:20:16 +0100620
621 for tens in ps.intermediates:
622 bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
Diqing Zhong69aadd02020-12-08 13:08:48 +0100623 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
Tim Hall79d07d22020-04-27 18:20:16 +0100624
625 if tens in replacement_read_bws:
626 bw = replacement_read_bws[tens]
627 else:
628 bw = tens.bandwidth()
629
630 bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Diqing Zhong69aadd02020-12-08 13:08:48 +0100631 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Tim Hall79d07d22020-04-27 18:20:16 +0100632
633 for tens in ps.inputs:
Diqing Zhong69aadd02020-12-08 13:08:48 +0100634 if tens in replacement_read_bws:
635 bw = replacement_read_bws[tens]
636 else:
637 bw = tens.bandwidth()
638
639 bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100640
641 op_shape = None
642 if ps.placement == PassPlacement.Npu and primary_op:
643 if tens == ps.ifm_tensor:
644 op_shape = ps.ifm_shapes[0]
645 elif tens == ps.ifm2_tensor:
646 op_shape = ps.ifm_shapes[1]
647
Diqing Zhong69aadd02020-12-08 13:08:48 +0100648 scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
Patrik Gustavsson3a269202021-01-21 08:28:55 +0100649 arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
Diqing Zhonge168b962020-11-05 17:18:47 +0100650 )
Tim Hall79d07d22020-04-27 18:20:16 +0100651
    # Quickly build access counts for only the current pass, even though these aren't the final numbers
    update_summary_cycles(arch, scaled_bws, cycles)

    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    return bws, macs, cycles


def performance_for_cascaded_pass(arch, cps):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = make_cycles_array()

    for ps in cps.passes:
        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = bws
        ps.macs = macs
        ps.cycles = cycles
        total_bws += bws
        total_macs += macs
        total_cycles += cycles

    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths = bws
    cps.macs = macs
    cps.cycles = cycles
    return bws, macs, cycles


def calc_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        for cps in sg.cascaded_passes:
            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles