# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
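#
# As a rough illustration of the model (hypothetical numbers, not taken from any real network): a pass whose
# feature-map and weight traffic would keep the memory system busy for 2000 cycles but whose MAC work needs
# 5000 cycles is estimated at max(2000, 5000) = 5000 cycles, i.e. it is compute bound.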
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_features import Accelerator
from .architecture_features import Block
from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


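# Work out the [height, width] of the rolling buffer needed between two cascaded passes, based on the IFM block
# that the second pass reads back and the block height of the first pass.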
def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The performed height calculation is for worst case
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def make_metrics_arrays():
    return (make_bandwidth_array(), 0, make_cycles_array())


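# Choose the IFM block depth. For convolution-like blocks it depends on the IFM element width and the block
# traversal order; otherwise the OFM block depth is used. The result is always capped at the actual IFM depth.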
def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

    return min(ifm_depth, ifm_blk_depth)


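# Rough lower bound on the cycles needed to run a single NPU command: read latency, transfer of one IFM block,
# DPU work, output work, write latency and transfer of one OFM block, divided by 4, presumably to account for
# overlap between queued commands.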
def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_transfer_efficiency(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_transfer_efficiency(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    return (
        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
        + cycles_ofm_blk
    ) / 4


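# Estimate cycles spent in the output/activation stage. A per-element cycle cost is looked up from the
# architecture tables, indexed by operation category and fused activation; for elementwise ops the cost is also
# bounded below by the minimal command cost per block.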
def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    faf = None if primary_op.activation is None else primary_op.activation.op_type
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch, ifm_tensor, ofm_tensor, block_config, block_config, num_elems_blk * cycle_per_elem
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem


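# Estimate NPU cycles for convolution, depthwise, vector product, pooling and reduce-sum passes by modelling the
# DPU cycles per OFM block and overlapping them with the output stage.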
def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = primary_op.ifm_shapes[0]
    ofm_tens_shape = primary_op.ofm_shapes[0]

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape.height == 1
        # Optimisation only applies for even width tensors
        and ofm_tens_shape.width % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_ofm_blk = 0
    total_cycles = 0
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

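    # Split the kernel into sub-kernels bounded by arch.sub_kernel_limits; DPU cycles are estimated per sub-kernel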
    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

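    # Estimate DPU cycles for one OFM block, per sub-kernel; the cost depends on the block type, the IFM data type
    # and the block traversal order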
    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

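        # Model extra delay cycles that occur for small block configurations (a single micro-block in x and y);
        # the amount depends on the accelerator variant and whether 40-bit accumulators are used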
        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
    )

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        cycles_bias_blk = (
            10
            * min(ofm_block.depth, ofm_tens_shape.depth)
            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
            / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    cycles_cmd = get_minimal_cmd_cycles(
        arch, ifm_tensor, ofm_tensor, ifm_block, ofm_block, cycles_dpu_blk, cycles_output_blk
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles


def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length;
    # this is related to the data format, block shape, tensor shape, etc.
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    tens = tensor.clone()
    if not tens.avoid_NHCWB16:
        tens.set_format(TensorFormat.NHCWB16, arch)

    if tens.format == TensorFormat.NHCWB16:
        if tens.get_strides()[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    return bw * (arch.memory_burst_length[mem_area] / burst_len)


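# Estimate bandwidths, MAC count and cycles for a single Pass.
# Returns (bws, macs, cycles, ifm_read_multiple, weight_read_multiple).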
def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0
    cycles = make_cycles_array()
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Npu and primary_op:
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = ps.primary_op.ifm_shapes[0].clone()
        ofm_tensor_shape = ps.primary_op.ofm_shapes[0].clone()

        if npu_block_type == NpuBlockType.ReduceSum:
            block_traversal = TensorBlockTraversal.DepthFirst
        elif npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
        ):
            block_traversal = weight_tensor.block_traversal
        else:
            block_traversal = TensorBlockTraversal.Default
        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
        )

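        # Convolution-like and pooling blocks: derive weight shapes, MAC count, read multiples and DPU cycles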
        if npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
        ):
            # extend the IFM to full dimensions

            batch_size = ifm_tensor_shape.batch

            # add in padding
            ifm_tensor_shape.height += explicit_padding[0] + explicit_padding[2]  # height += top and bottom
            ifm_tensor_shape.width += explicit_padding[1] + explicit_padding[3]  # width += left and right

            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    # For vector product, the weight format of IO is extended to HWIO, with H=W=1
                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

                nn_ops = (
                    int(ofm_tensor_shape.batch)
                    * int(ofm_tensor_shape.height)
                    * int(ofm_tensor_shape.width)
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                weight_tensor_shape = [
                    primary_op.attrs["ksize"][1],
                    primary_op.attrs["ksize"][2],
                    1,
                    ifm_tensor_shape.depth,
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # count the sub kernels; the IFM block needs to be refetched for each of them
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
                n_full_depth_stages = 1  # force no reread

            ifm_read_multiple = n_sub_kernels * n_full_depth_stages
            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple

            weight_read_multiple = numeric_util.round_up_divide(
                ofm_tensor_shape.height, ofm_block.height
            ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
            replacement_read_bws[weight_tensor] = (
                batch_size
                * shape_num_elements(weight_tensor_bandwidth_shape)
                * weight_tensor_element_size
                * weight_tensor_bandwidth_compression_scale
                * weight_read_multiple
            )

            macs += nn_ops
            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                arch,
                npu_block_type,
                primary_op,
                ifm_block,
                ofm_block,
                block_traversal,
                kernel_dims,
                ifm_tensor,
                ofm_tensor,
                ps.scale_tensor,
            )
        elif npu_block_type == NpuBlockType.ElementWise:
            # Work out how many elements we have and calculate performance.
            cycles[PassCycles.Npu] = estimate_output_cycles(
                arch,
                npu_block_type,
                primary_op,
                ofm_tensor.elements(),
                ps.ifm_tensor,
                ps.ofm_tensor,
                None,
                ps.ifm2_tensor,
                ofm_block,
            )

        prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
        if prev_npu_pass is None:
            # cycles for DMA ops in first pass
            dma_ops = (op for op in ps.ops if op.type == Op.DMA)
            for dma_op in dma_ops:
                mem_area = dma_op.attrs["source"]
                for tens in dma_op.inputs:
                    cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]

    if rewrite_list is not None:
        # apply the desired rewrites
        for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
            if ps != ps_to_rewrite:
                continue
            if rewrite_op == SchedulerRewrite.Nop:
                pass  # these are fine, no bandwidth changes
            elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
                bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
                if tens.purpose == TensorPurpose.FeatureMap:
                    scaled_bw = estimate_memory_transfer_efficiency(
                        arch,
                        arch.fast_storage_mem_area,
                        BandwidthDirection.Read,
                        tens,
                        ifm_block,
                        replacement_read_bws[tens],
                    )
                else:
                    scaled_bw = replacement_read_bws[tens]
                scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
                replacement_read_bws[tens] = 0

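    # Accumulate raw and transfer-efficiency-scaled bandwidths for the outputs, intermediates and inputs of the pass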
    for tens in ps.outputs:
        if force_outputs_to_fast_storage:
            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[arch.fast_storage_mem_area][tens.purpose][
                BandwidthDirection.Write
            ] += estimate_memory_transfer_efficiency(
                arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block
            )
        else:
            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
                arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block
            )

    for tens in ps.intermediates:
        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

    for tens in ps.inputs:
        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw
        )

    # quickly build access counts for only the current pass, even though these aren't the final numbers
    update_summary_cycles(arch, scaled_bws, cycles)

    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


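# Convert the accumulated bandwidths into access cycles per memory area; the total is the maximum over all
# cycle categories.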
def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    return bws, macs, cycles


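# Sum the per-pass metrics over all passes in a cascaded pass and store them on both the passes and the
# cascaded pass.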
def performance_for_cascaded_pass(arch, cps):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = make_cycles_array()

    for ps in cps.passes:
        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = bws
        ps.macs = macs
        ps.cycles = cycles
        total_bws += bws
        total_macs += macs
        total_cycles += cycles

    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths = bws
    cps.macs = macs
    cps.cycles = cycles
    return bws, macs, cycles


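# Walk all subgraphs and cascaded passes, accumulating network totals onto the nng graph object.
# Typical usage (a sketch; nng and arch are created by the compiler driver, outside this module):
#     calc_performance_for_network(nng, arch)
#     total_cycles = nng.cycles[PassCycles.Total]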
def calc_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        for cps in sg.cascaded_passes:
            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles