# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
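#
# As a rough illustration of the model: a pass that needs ~10k cycles of NPU compute but ~15k cycles to move its data
# at the available memory bandwidth is estimated at ~15k cycles, since the larger of the two dominates.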
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_features import Accelerator
from .architecture_features import Block
from .data_type import DataType
from .nn_graph import PassPlacement
from .nn_graph import SchedulerRewrite
from .operation import NpuBlockType
from .operation import Op
from .shared_buffer_allocation import is_acc_40bits_used
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import shape_num_elements
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


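# Returns the [height, width] of the rolling buffer needed between two cascaded passes, based on the IFM block that
# the second pass reads from the first pass's output.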
def rolling_buffer_dims_from_passes(arch, ps1, block_config_ps1, ps2, block_config_ps2):
    ofm_block = Block(block_config_ps2[-3], block_config_ps2[-4], block_config_ps2[-1])
    kernel = ps2.primary_op.kernel

    if ps2.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
        op = ps2.primary_op
        ifm_block_depth = arch.calc_ifm_block_depth(op.ifm_shapes[0].depth, op.ifm.dtype.size_in_bits())
    else:
        ifm_block_depth = block_config_ps2[-1]

    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)

    # The height calculation below is for the worst case
    height = numeric_util.round_up(ifm_block.height + block_config_ps1[0], block_config_ps1[0])
    width = ifm_block.width
    return [height, width]


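# Cycle counters tracked per pass: NPU compute cycles plus one access counter per memory area.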
class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def make_metrics_arrays():
    return (make_bandwidth_array(), 0, make_cycles_array())


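# Returns the IFM block depth for the given block type and element width, capped at the actual IFM depth.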
def get_ifm_block_depth(npu_block_type, ifm_depth, ifm_elemwidth, block_traversal, ofm_blk_depth):
    ifm_blk_depth = ofm_blk_depth

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        if ifm_elemwidth == 16 or block_traversal == TensorBlockTraversal.PartKernelFirst:
            ifm_blk_depth = 16
        elif ifm_elemwidth == 8:
            ifm_blk_depth = 32
        else:
            ifm_blk_depth = 8

    return min(ifm_depth, ifm_blk_depth)


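# Rough per-block lower bound imposed by command processing: memory read/write latency plus IFM/OFM block transfer,
# compute and output cycles, divided by 4 (assumed overlap between consecutive blocks).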
def get_minimal_cmd_cycles(arch, ifm_tensor, ofm_tensor, ifm_blk: Block, ofm_blk: Block, output_cycles, dpu_cycles=0):
    ifm_tens_blk = Tensor((1, ifm_blk.height, ifm_blk.width, ifm_blk.depth), ifm_tensor.dtype, "ifm_blk")
    ofm_tens_blk = Tensor((1, ofm_blk.height, ofm_blk.width, ofm_blk.depth), ofm_tensor.dtype, "ofm_blk")
    cycles_ifm_blk = (
        estimate_memory_transfer_efficiency(arch, ifm_tensor.mem_area, BandwidthDirection.Read, ifm_tens_blk, ifm_blk)
        / arch.memory_bandwidths_per_cycle[ifm_tensor.mem_area]
    )
    cycles_ofm_blk = (
        estimate_memory_transfer_efficiency(arch, ofm_tensor.mem_area, BandwidthDirection.Write, ofm_tens_blk, ofm_blk)
        / arch.memory_bandwidths_per_cycle[ofm_tensor.mem_area]
    )
    return (
        arch.memory_latency[ifm_tensor.mem_area][BandwidthDirection.Read]
        + cycles_ifm_blk
        + dpu_cycles
        + output_cycles
        + arch.memory_latency[ofm_tensor.mem_area][BandwidthDirection.Write]
        + cycles_ofm_blk
    ) / 4


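# Estimates cycles for the output/activation stage: a per-element cost is looked up from the architecture tables by
# operator and activation type; for elementwise ops the per-block command overhead is also taken into account.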
def estimate_output_cycles(
    arch,
    npu_block_type,
    primary_op,
    num_elems,
    ifm_tensor,
    ofm_tensor,
    use_acc_40bits=False,
    ifm2_tensor=None,
    block_config: Block = None,
):
    faf = None if primary_op.activation is None else primary_op.activation.op_type
    if npu_block_type == NpuBlockType.ElementWise and ifm_tensor.dtype == DataType.int32:
        if ifm2_tensor is None:
            # Unary op
            output_perf_index = 0
        else:
            # Binary op
            output_perf_index = 1
    elif primary_op.type == Op.Mul and ofm_tensor.dtype == DataType.int32:
        output_perf_index = 2
    elif primary_op.type == Op.Mul or (
        npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and use_acc_40bits
    ):
        output_perf_index = 3
    elif primary_op.type in (Op.Add, Op.Sub):
        input_scale = ifm_tensor.quantization.scale_f32
        input2_scale = ifm2_tensor.quantization.scale_f32
        output_scale = ofm_tensor.quantization.scale_f32

        if "resizebilinear" in primary_op.attrs:
            output_scale = input2_scale

        if None in (input_scale, input2_scale, output_scale) or input_scale == input2_scale:
            # Simple Add/Sub
            output_perf_index = 4
        else:
            # Advanced Add/Sub
            output_perf_index = 5
    elif primary_op.type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if primary_op.type.is_elementwise_op() and block_config is not None:
        num_elems_blk = block_config.width * block_config.height * block_config.depth
        cycle_cmd = get_minimal_cmd_cycles(
            arch, ifm_tensor, ofm_tensor, block_config, block_config, num_elems_blk * cycle_per_elem
        )
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return num_elems * cycle_per_elem


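# Estimates NPU cycles for convolution, depthwise, pooling, vector product and reduce sum by modelling the DPU work
# per OFM block (sub kernels, microblocks, accumulator delays) and overlapping it with the output stage.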
def estimate_conv_pooling_cycles(
    arch,
    npu_block_type,
    primary_op,
    ifm_block: Block,
    ofm_block: Block,
    block_traversal,
    kernel_dims,
    ifm_tensor,
    ofm_tensor,
    scale_tensor=None,
):
    ofm_ublock = Block(arch.config.ofm_ublock.width, arch.config.ofm_ublock.height, arch.config.ofm_ublock.depth)
    ifm_tens_shape = primary_op.ifm_shapes[0]
    ofm_tens_shape = primary_op.ofm_shapes[0]

    if (
        arch.config.ofm_ublock.height == 2
        and npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and ofm_tens_shape.height == 1
        # Optimisation only applies for even width tensors
        and ofm_tens_shape.width % 2 == 0
        and kernel_dims[0] == 1
    ):
        ofm_ublock.width = 4
        ofm_ublock.height = 1
        ofm_block.height = 1

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = ofm_block.height // ofm_ublock.height
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = ofm_block.depth // ofm_ublock.depth
    num_ofm_blk = 0
    total_cycles = 0
    num_elems_blk = ofm_block.width * ofm_block.height * ofm_block.depth
    use_acc_40bits = is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor)

    sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
    sub_kernel_x = [
        min((kernel_dims[1] - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((kernel_dims[0] - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if ifm_tensor.dtype.size_in_bits() == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if ifm_tensor.dtype.size_in_bits() == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal != TensorBlockTraversal.PartKernelFirst)
            or npu_block_type == NpuBlockType.VectorProduct
            or npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert block_traversal == TensorBlockTraversal.PartKernelFirst
            divider = 2 if ifm_tensor.dtype.size_in_bits() == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            delay = (
                3
                if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128)
                else 2
            )
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if npu_block_type == NpuBlockType.ConvolutionMxN and block_traversal == TensorBlockTraversal.PartKernelFirst:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(ifm_tens_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    num_ofm_blk = (
        numeric_util.round_up_divide(ofm_tens_shape.height, ofm_block.height)
        * numeric_util.round_up_divide(ofm_tens_shape.width, ofm_block.width)
        * numeric_util.round_up_divide(ofm_tens_shape.depth, ofm_block.depth)
    )

    cycles_output_blk = estimate_output_cycles(
        arch, npu_block_type, primary_op, num_elems_blk, ifm_tensor, ofm_tensor, use_acc_40bits
    )

    if scale_tensor:
        cycles_bias_blk = (
            10
            * min(ofm_block.depth, ofm_tens_shape.depth)
            * arch.memory_latency[scale_tensor.mem_area][BandwidthDirection.Read]
            / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    cycles_cmd = get_minimal_cmd_cycles(
        arch, ifm_tensor, ofm_tensor, ifm_block, ofm_block, cycles_dpu_blk, cycles_output_blk
    )
    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blk + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blk + cycles_dpu_blk

    return total_cycles


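# Scales a tensor's bandwidth by how efficiently it can be burst accessed; bursts shorter than the memory's burst
# length inflate the effective bandwidth requirement.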
def estimate_memory_transfer_efficiency(arch, mem_area, direction, tensor, block_size: Block, replace_bw=None):
    if tensor.format not in (TensorFormat.NHWC, TensorFormat.NHCWB16):
        return tensor.bandwidth() if replace_bw is None else replace_bw

    # Estimate memory transfer efficiency by calculating the burst length;
    # this depends on the data format, block shape and tensor shape, among other things.
    burst_len = 0
    elem_size = tensor.dtype.size_in_bytes()
    is_ifm = direction == BandwidthDirection.Read
    tens = tensor.clone()
    if not tens.avoid_NHCWB16:
        tens.set_format(TensorFormat.NHCWB16, arch)

    if tens.format == TensorFormat.NHCWB16:
        if tens.get_strides()[1] == block_size.depth:
            burst_len = elem_size * block_size.depth * block_size.width
        elif is_ifm:
            burst_len = 16 * elem_size * block_size.width
        else:
            burst_len = 16 * elem_size * block_size.width * arch.ncores
    else:
        assert tens.format == TensorFormat.NHWC
        if is_ifm:
            if tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = elem_size * block_size.depth
        else:
            if block_size.depth <= 16 and tens.get_strides()[3] == block_size.depth:
                burst_len = elem_size * block_size.depth * block_size.width
            else:
                burst_len = min(64, 16 * elem_size * arch.ncores, block_size.depth * elem_size)

    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    bw = tens.bandwidth() if replace_bw is None else replace_bw

    return bw * (arch.memory_burst_length[mem_area] / burst_len)


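# Main per-pass estimator: returns (bandwidths, macs, cycles, ifm_read_multiple, weight_read_multiple), combining the
# cycle estimates above with raw and efficiency-scaled bandwidth counts.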
def performance_metrics_for_pass(arch, ps, block_config=None, rewrite_list=None, force_outputs_to_fast_storage=False):
    if block_config is None:
        block_config = ps.block_config
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # bandwidths scaled by memory transfer efficiency
    macs = 0
    cycles = make_cycles_array()
    ifm_read_multiple = 1
    weight_read_multiple = 0

    if ps.placement in (PassPlacement.MemoryOnly, PassPlacement.StartupInit):
        return bws, macs, cycles, ifm_read_multiple, weight_read_multiple  # nothing real happening in this pass

    explicit_padding = (0, 0, 0, 0)
    primary_op = ps.primary_op
    replacement_read_bws = {}
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    ifm_block = Block(block_config[1], block_config[0], block_config[3])

    if ps.placement == PassPlacement.Npu and primary_op:
        explicit_padding = primary_op.attrs.get("explicit_padding", explicit_padding)
        assert primary_op.type.npu_block_type == ps.npu_block_type
        npu_block_type = primary_op.type.npu_block_type

        ifm_tensor, _, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        ifm_tensor_shape = ps.primary_op.ifm_shapes[0].clone()
        ofm_tensor_shape = ps.primary_op.ofm_shapes[0].clone()
        ofm_block.width = min(ofm_block.width, ofm_tensor_shape.width)
        ofm_block.height = min(ofm_block.height, ofm_tensor_shape.height)
        ofm_block.depth = min(ofm_block.depth, ofm_tensor_shape.depth)

        if npu_block_type == NpuBlockType.ReduceSum:
            block_traversal = TensorBlockTraversal.DepthFirst
        elif npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
        ):
            block_traversal = weight_tensor.block_traversal
        else:
            block_traversal = TensorBlockTraversal.Default
        ifm_block_depth = get_ifm_block_depth(
            npu_block_type, ifm_tensor_shape.depth, ifm_tensor.dtype.size_in_bits(), block_traversal, ofm_block.depth
        )
        ifm_block = arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, primary_op.kernel, ifm_resampling_mode=ifm_tensor.resampling_mode
        )
        ifm_block.width = min(ifm_block.width, ifm_tensor_shape.width)
        ifm_block.height = min(ifm_block.height, ifm_tensor_shape.height)

        if npu_block_type in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.VectorProduct,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
        ):
            # extend the ifm to its full dimensions

            batch_size = ifm_tensor_shape.batch

            # add in padding
            ifm_tensor_shape.height += explicit_padding[0] + explicit_padding[2]  # height += top and bottom
            ifm_tensor_shape.width += explicit_padding[1] + explicit_padding[3]  # width += left and right

            if npu_block_type != NpuBlockType.Pooling:
                if npu_block_type == NpuBlockType.ReduceSum:
                    weight_tensor_shape = [1, 1, ifm_tensor.shape[3], ofm_tensor.shape[3]]
                    weight_tensor_bandwidth_shape = [0] * 4
                    weight_tensor_element_size = 0
                    weight_tensor_bandwidth_compression_scale = 0.0
                else:
                    # For Vector product, weight format of IO is extended to HWIO, with H=W=1
                    weight_tensor_shape = numeric_util.full_shape(4, weight_tensor.shape, 1)
                    weight_tensor_bandwidth_shape = numeric_util.full_shape(4, weight_tensor.bandwidth_shape, 1)
                    weight_tensor_element_size = weight_tensor.element_size()
                    weight_tensor_bandwidth_compression_scale = weight_tensor.bandwidth_compression_scale

                nn_ops = (
                    int(ofm_tensor_shape.batch)
                    * int(ofm_tensor_shape.height)
                    * int(ofm_tensor_shape.width)
                    * int(weight_tensor_shape[0])
                    * int(weight_tensor_shape[1])
                    * int(weight_tensor_shape[2])
                    * int(weight_tensor_shape[3])
                )
            else:
                weight_tensor_shape = [
                    primary_op.attrs["ksize"][1],
                    primary_op.attrs["ksize"][2],
                    1,
                    ifm_tensor_shape.depth,
                ]
                weight_tensor_bandwidth_shape = weight_tensor_shape
                weight_tensor_element_size = 0
                weight_tensor_bandwidth_compression_scale = 0.0
                nn_ops = 0  # pooling doesn't count as NN ops

            kernel_dims = weight_tensor_shape[:2]

            sub_kernel_limits = arch.sub_kernel_limits[npu_block_type]
            # count the sub kernels; the IFM block needs to be refetched for each of them
            n_sub_kernels_y = numeric_util.round_up_divide(kernel_dims[0], sub_kernel_limits[0])
            n_sub_kernels_x = numeric_util.round_up_divide(kernel_dims[1], sub_kernel_limits[1])
            n_sub_kernels = n_sub_kernels_y * n_sub_kernels_x

            n_full_depth_stages = numeric_util.round_up_divide(weight_tensor_bandwidth_shape[3], ofm_block.depth)
            if npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
                n_full_depth_stages = 1  # force to no reread

            ifm_read_multiple = n_sub_kernels * n_full_depth_stages
            replacement_read_bws[ifm_tensor] = ifm_tensor.bandwidth() * ifm_read_multiple

            weight_read_multiple = numeric_util.round_up_divide(
                ofm_tensor_shape.height, ofm_block.height
            ) * numeric_util.round_up_divide(ofm_tensor_shape.width, ofm_block.width)
            replacement_read_bws[weight_tensor] = (
                batch_size
                * shape_num_elements(weight_tensor_bandwidth_shape)
                * weight_tensor_element_size
                * weight_tensor_bandwidth_compression_scale
                * weight_read_multiple
            )

            macs += nn_ops
            cycles[PassCycles.Npu] = estimate_conv_pooling_cycles(
                arch,
                npu_block_type,
                primary_op,
                ifm_block,
                ofm_block,
                block_traversal,
                kernel_dims,
                ifm_tensor,
                ofm_tensor,
                ps.scale_tensor,
            )
        elif npu_block_type == NpuBlockType.ElementWise:
            # Work out how many elements we have and calculate performance.
            cycles[PassCycles.Npu] = estimate_output_cycles(
                arch,
                npu_block_type,
                primary_op,
                ofm_tensor.elements(),
                ps.ifm_tensor,
                ps.ofm_tensor,
                None,
                ps.ifm2_tensor,
                ofm_block,
            )

        prev_npu_pass = next((npu_ps for npu_ps in ps.dag_predecessors if npu_ps.placement is PassPlacement.Npu), None)
        if prev_npu_pass is None:
            # cycles for DMA ops in first pass
            dma_ops = (op for op in ps.ops if op.type == Op.DMA)
            for dma_op in dma_ops:
                mem_area = dma_op.attrs["source"]
                for tens in dma_op.inputs:
                    cycles[PassCycles.Npu] += tens.storage_size() / arch.memory_bandwidths_per_cycle[mem_area]

    if rewrite_list is not None:
        # apply the desired rewrites
        for rewrite_op, tens, _, _, _, ps_to_rewrite in rewrite_list:
            if ps != ps_to_rewrite:
                continue
            if rewrite_op == SchedulerRewrite.Nop:
                pass  # these are fine, no bandwidth changes
            elif rewrite_op in (SchedulerRewrite.ChangeTensorSubPurpose,):
                bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += replacement_read_bws[tens]
                if tens.purpose == TensorPurpose.FeatureMap:
                    scaled_bw = estimate_memory_transfer_efficiency(
                        arch,
                        arch.fast_storage_mem_area,
                        BandwidthDirection.Read,
                        tens,
                        ifm_block,
                        replacement_read_bws[tens],
                    )
                else:
                    scaled_bw = replacement_read_bws[tens]
                scaled_bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Read] += scaled_bw
                replacement_read_bws[tens] = 0

    for tens in ps.outputs:
        if force_outputs_to_fast_storage:
            bws[arch.fast_storage_mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[arch.fast_storage_mem_area][tens.purpose][
                BandwidthDirection.Write
            ] += estimate_memory_transfer_efficiency(
                arch, arch.fast_storage_mem_area, BandwidthDirection.Write, tens, ofm_block
            )
        else:
            bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
            scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += estimate_memory_transfer_efficiency(
                arch, tens.mem_area, BandwidthDirection.Write, tens, ofm_block
            )

    for tens in ps.intermediates:
        bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Write] += tens.bandwidth()

        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw

    for tens in ps.inputs:
        if tens in replacement_read_bws:
            bw = replacement_read_bws[tens]
        else:
            bw = tens.bandwidth()

        bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += bw
        scaled_bws[tens.mem_area][tens.purpose][BandwidthDirection.Read] += estimate_memory_transfer_efficiency(
            arch, tens.mem_area, BandwidthDirection.Read, tens, ifm_block, bw
        )

    # quickly build access counts for only the current pass, even though these aren't the final numbers
    update_summary_cycles(arch, scaled_bws, cycles)

    return bws, macs, cycles, ifm_read_multiple, weight_read_multiple


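# Converts the per-memory-area bandwidth totals into access cycles; the pass total is the maximum of the NPU compute
# cycles and the access cycles.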
def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles


def collate_stats_for_cascaded_pass(arch, bws, macs, cycles):
    return bws, macs, cycles


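# Accumulates the per-pass metrics over all passes in a cascaded pass and stores the totals on both levels.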
def performance_for_cascaded_pass(arch, cps):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = make_cycles_array()

    for ps in cps.passes:
        bws, macs, cycles, _, _ = performance_metrics_for_pass(arch, ps)
        ps.bandwidths = bws
        ps.macs = macs
        ps.cycles = cycles
        total_bws += bws
        total_macs += macs
        total_cycles += cycles

    bws, macs, cycles = collate_stats_for_cascaded_pass(arch, total_bws, total_macs, total_cycles)
    cps.bandwidths = bws
    cps.macs = macs
    cps.cycles = cycles
    return bws, macs, cycles


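# Network-level aggregation: sums the cascaded pass metrics over all subgraphs and stores them on the graph.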
def calc_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        for cps in sg.cascaded_passes:
            bws, macs, cycles = performance_for_cascaded_pass(arch, cps)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles