# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
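#
# As a rough sketch of the model with hypothetical numbers: a pass that must move 1 MiB at 4 bytes/cycle
# needs 1048576 / 4 = 262144 cycles for bandwidth, while 1048576 MACs at 256 MACs/cycle need only 4096
# cycles for computing, so the estimate is max(262144, 4096) = 262144 cycles and the pass is bandwidth bound.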
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_features import Accelerator
from .architecture_features import Block
from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The height calculation is for the worst case
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def make_metrics_arrays():
    return (make_bandwidth_array(), 0, make_cycles_array())


def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

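    # For example, an 8-bit depth-first ConvolutionMxN with ifm_depth=64 gives min(64, 32) = 32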
    return min(ifm_depth, ifm_blk_depth)


def get_minimal_cmd_cycles(
    arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, ifm_shape4D, ofm_shape4D, dpu_cycles=0
):
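    # Lower bound on the cycles for one NPU command covering an IFM/OFM block pair: memory latency plus
    # block transfer time on each side, around the compute and output time. The final division by 4 is a
    # fixed model constant, interpreted here as overlap between in-flight commands (an assumption)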
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk, shape4D=ifm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_transfer_efficiency(
            arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk, shape4D=ofm_shape4D
        )
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    return (
        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
        + cycles_ofm_blk
    ) / 4


def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    faf = None if primary_op.activation is None else primary_op.activation.op_type
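    # Classify the op into one of the per-element cost classes; the resulting index selects an entry
    # in arch.output_cycles_per_elem below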
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch,
            ifm_tensor,
            ofm_tensor,
            block_config,
            block_config,
            num_elems_blk * cycle_per_elem,
            primary_op.ifm_shapes[0],
            primary_op.ofm_shapes[0],
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem


def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = primary_op.ifm_shapes[0]
    ofm_tens_shape = primary_op.ofm_shapes[0]

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape.height == 1
        # Optimisation only applies for even width tensors
        and ofm_tens_shape.width % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_ofm_blk = 0
    total_cycles = 0
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
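    # Assumption: cycles_wb approximates the weight-buffer fill/decode time per kernel step,
    # scaling with the microblock depth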
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

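        # Model pipeline stalls between kernel steps: with few microblocks in flight there is not
        # enough work to hide the accumulator latency, and the penalty grows when 40-bit
        # accumulators are used (an interpretation of the constants below)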
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
    )

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        cycles_bias_blk = (
            10
            * min(ofm_block.depth, ofm_tens_shape.depth)
            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
            / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    cycles_cmd = get_minimal_cmd_cycles(
        arch,
        ifm_tensor,
        ofm_tensor,
        ifm_block,
        ofm_block,
        cycles_dpu_blk,
        ifm_tens_shape,
        ofm_tens_shape,
        cycles_output_blk,
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles


def estimate_memory_transfer_efficiency(
    arch, mem_area, direction, tensor, block_size: Block, replace_bw=None, shape4D=None
):
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length;
    # this depends on the data format, the block shape, and the tensor shape, among other things
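    # For example, an NHWC IFM read whose stride matches the block depth can burst a whole
    # depth * width row at a time, while a mismatched stride limits each burst to a single depth column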
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    tens = tensor.clone()
    if not tens.avoid_NHCWB16:
        tens.set_format(TensorFormat.NHCWB16, arch)
    strides = tens.get_strides(shape4D=shape4D)

    if tens.format == TensorFormat.NHCWB16:
        if strides[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    return bw * (arch.memory_burst_length[mem_area] / burst_len)


def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0
    cycles = make_cycles_array()
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Npu and primary_op:
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = ps.primary_op.ifm_shapes[0]
        ofm_tensor_shape = ps.primary_op.ofm_shapes[0]
        ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
        ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
        ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)

        if npu_block_type == NpuBlockType.ReduceSum:
            block_traversal = TensorBlockTraversal.DepthFirst
        elif npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
        ):
            block_traversal = weight_tensor.block_traversal
        else:
            block_traversal = TensorBlockTraversal.Default
        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
        )
        ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
        ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)

        if npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
        ):
            # extend the IFM to its full dimensions

            batch_size = ifm_tensor_shape.batch

            # add in padding, height += top and bottom, width += left and right
            ifm_tensor_shape = ifm_tensor_shape.add(
                0, explicit_padding[0] + explicit_padding[2], explicit_padding[1] + explicit_padding[3], 0
            )

            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    # For vector products, the weight IO format is extended to HWIO, with H=W=1
                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

                nn_ops = (
                    int(ofm_tensor_shape.batch)
                    * int(ofm_tensor_shape.height)
                    * int(ofm_tensor_shape.width)
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                weight_tensor_shape = [
                    *primary_op.get_kernel_size(),
                    1,
                    ifm_tensor_shape.depth,
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # count the sub kernels; the IFM block needs to be refetched for each of them
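            # (e.g. with 8x8 sub-kernel limits, a 9x9 kernel is split into 2 * 2 = 4 sub-kernels)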
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
                n_full_depth_stages = 1  # force to no reread

            ifm_read_multiple = n_sub_kernels * n_full_depth_stages
            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple

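            # Weights are re-read once per OFM block in the spatial plane (stepping in depth reads
            # new weights rather than re-reading them)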
            weight_read_multiple = numeric_util.round_up_divide(
                ofm_tensor_shape.height, ofm_block.height
            ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
            replacement_read_bws[weight_tensor] = (
                batch_size
                * shape_num_elements(weight_tensor_bandwidth_shape)
                * weight_tensor_element_size
                * weight_tensor_bandwidth_compression_scale
                * weight_read_multiple
            )

            macs += nn_ops
            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                arch,
                npu_block_type,
                primary_op,
                ifm_block,
                ofm_block,
                block_traversal,
                kernel_dims,
                ifm_tensor,
                ofm_tensor,
                ps.scale_tensor,
            )
        elif npu_block_type == NpuBlockType.ElementWise:
            # Work out how many elements we have and calculate performance.
            cycles[PassCycles.Npu] = estimate_output_cycles(
                arch,
                npu_block_type,
                primary_op,
                ofm_tensor.elements(),
                ps.ifm_tensor,
                ps.ofm_tensor,
                None,
                ps.ifm2_tensor,
                ofm_block,
            )

        prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
        if prev_npu_pass is None:
            # cycles for DMA ops in first pass
            dma_ops = (op for op in ps.ops if op.type == Op.DMA)
            for dma_op in dma_ops:
                mem_area = dma_op.attrs["source"]
                for tens in dma_op.inputs:
                    cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]

    if rewrite_list is not None:
        # apply the desired rewrites
        for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
            if ps != ps_to_rewrite:
                continue
            if rewrite_op == SchedulerRewrite.Nop:
                pass  # these are fine, no bandwidth changes
            elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
                bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
                if tens.purpose == TensorPurpose.FeatureMap:
                    scaled_bw = estimate_memory_transfer_efficiency(
                        arch,
                        arch.fast_storage_mem_area,
                        BandwidthDirection.Read,
                        tens,
                        ifm_block,
                        replacement_read_bws[tens],
                    )
                else:
                    scaled_bw = replacement_read_bws[tens]
                scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
                replacement_read_bws[tens] = 0

    for tens in ps.outputs:
        if force_outputs_to_fast_storage:
            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[arch.fast_storage_mem_area][tens.purpose][
                BandwidthDirection.Write
            ] += estimate_memory_transfer_efficiency(
                arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0],
            )
        else:
            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
                arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block, shape4D=ps.ofm_shapes[0]
            )

    for tens in ps.intermediates:
        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

    for tens in ps.inputs:
        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

        op_shape = None
        if ps.placement == PassPlacement.Npu and primary_op:
            if tens == ps.ifm_tensor:
                op_shape = ps.ifm_shapes[0]
            elif tens == ps.ifm2_tensor:
                op_shape = ps.ifm_shapes[1]

        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw, op_shape
        )

    # quickly build access counts for only the current pass, even though these aren't the final numbers
    update_summary_cycles(arch, scaled_bws, cycles)

    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    return bws, macs, cycles


def performance_for_cascaded_pass(arch, cps):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = make_cycles_array()

    for ps in cps.passes:
        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = bws
        ps.macs = macs
        ps.cycles = cycles
        total_bws += bws
        total_macs += macs
        total_cycles += cycles

    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths = bws
    cps.macs = macs
    cps.cycles = cycles
    return bws, macs, cycles


def calc_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        for cps in sg.cascaded_passes:
            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles