# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# NPU performance estimation functions to estimate performance of a Pass and CascadedPass. Uses a model that takes the
# maximum of the 'cycles required for bandwidth' and 'cycles required for computing'.
#
# Called during scheduling to evaluate different proposals, as well as post-scheduling to provide a final performance
# estimate.
import copy
from enum import auto
from enum import IntEnum

import numpy as np

from . import numeric_util
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_features import Accelerator
from .architecture_features import NpuBlockType
from .architecture_features import SHRAMElements
from .architecture_features import TensorFormat
from .numeric_util import round_up
from .operation import Kernel
from .operation import Op
from .scheduler import Schedule
from .scheduler import SchedulerOperation
from .shape4d import Shape4D
from .tensor import BandwidthDirection
from .tensor import MemArea
from .tensor import TensorPurpose
from .weight_compressor import WeightKey


class PassCycles(IntEnum):
    Npu = 0
    SramAccess = auto()
    DramAccess = auto()
    OnChipFlashAccess = auto()
    OffChipFlashAccess = auto()
    Total = auto()
    Size = auto()

    def display_name(self):
        return ("NPU", "SRAM Access", "DRAM Access", "On-chip Flash Access", "Off-chip Flash Access", "Total", "Size",)[
            self.value
        ]

    def identifier_name(self):
        return ("npu", "sram_access", "dram_access", "on_chip_flash_access", "off_chip_flash_access", "total", "size",)[
            self.value
        ]

    @staticmethod
    def all():
        return (
            PassCycles.Npu,
            PassCycles.SramAccess,
            PassCycles.DramAccess,
            PassCycles.OnChipFlashAccess,
            PassCycles.OffChipFlashAccess,
            PassCycles.Total,
        )


class PerformanceQuery:
    def __init__(self, npu_block_type=0):
        self.npu_block_type = npu_block_type
        self.ifm_shape = Shape4D(0)
        self.ifm_format = TensorFormat.NHWC
        self.ifm_memory_area = MemArea.Unknown
        self.ifm2_memory_area = MemArea.Unknown
        self.ifm_bits = 0
        self.ifm2_bits = 0
        self.ifm2_shape = None
        self.ifm2_format = TensorFormat.NHWC
        self.ofm_shape = Shape4D(0)
        self.ofm_format = TensorFormat.NHWC
        self.ofm_memory_area = MemArea.Unknown
        self.ofm_bits = 0
        self.const_shape = Shape4D(0)
        self.const_memory_area = MemArea.Unknown
        self.kernel = Kernel(1, 1)
        self.config = ArchitectureBlockConfig()


class CycleCost:
    def __init__(self):
        self.op_macs = 0
        self.op_cycles = 0

    def __mul__(self, scale):
        out = CycleCost()
        out.op_macs = self.op_macs * scale
        out.op_cycles = self.op_cycles * scale
        return out

    def __iadd__(self, rhs):
        self.op_macs += rhs.op_macs
        self.op_cycles += rhs.op_cycles
        return self

    def __str__(self):
        return "macs = {}, cycles = {}".format(self.op_macs, self.op_cycles)


class ElementAccess:
    def __init__(self):
        # List of ONLY element access counts; consumers
        # need to scale these values by the correct bitwidths
        # to calculate memory bandwidth
        self.ifm_read = [0, 0]  # ifm1, ifm2
        self.ofm_write = 0
        self.weights_refetch = 0
        self.const_read = [0, 0]  # weights, scales

    def __mul__(self, scale):
        out = ElementAccess()
        out.ifm_read[0] = self.ifm_read[0] * scale
        out.ifm_read[1] = self.ifm_read[1] * scale
        out.ofm_write = self.ofm_write * scale
        out.weights_refetch = self.weights_refetch * scale
        out.const_read[0] = self.const_read[0] * scale
        out.const_read[1] = self.const_read[1] * scale
        return out

    def __iadd__(self, rhs):
        self.ifm_read[0] += rhs.ifm_read[0]
        self.ifm_read[1] += rhs.ifm_read[1]
        self.ofm_write += rhs.ofm_write
        self.weights_refetch += rhs.weights_refetch
        self.const_read[0] += rhs.const_read[0]
        self.const_read[1] += rhs.const_read[1]
        return self

    def __str__(self):
        return "ifm read = {}, ofm write = {}, const read = {}".format(self.ifm_read, self.ofm_write, self.const_read)


def _strides_for_shape(shape: Shape4D, format: TensorFormat, element_bits):
    if format == TensorFormat.NHWC:
        strides = [0, 0, 0, 0]
        strides[3] = element_bits // 8  # +Z
        strides[2] = (element_bits * shape.depth) // 8  # +X
        strides[1] = (element_bits * shape.depth * shape.width) // 8  # +Y
        strides[0] = (element_bits * shape.depth * shape.width * shape.height) // 8  # +N
    elif format == TensorFormat.NHCWB16:
        strides = [0, 0, 0, 0, 0]
        strides[4] = element_bits // 8  # +Z
        strides[3] = (element_bits * 16) // 8  # +X
        strides[2] = (element_bits * 16 * shape.width) // 8  # +C
        strides[1] = (element_bits * shape.width * shape.depth) // 8  # +Y
        strides[0] = (element_bits * shape.width * shape.depth * shape.height) // 8  # +N (full H x W x C plane)

    return strides
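
# Worked example (illustrative): an NHWC feature map of Shape4D(1, 8, 8, 16)
# at 8 bits/element yields byte strides [1024, 128, 16, 1] for +N, +Y, +X, +Z:
# a +X step skips one 16-byte depth slice, a +Y step skips a full row of
# 8 such slices, and a +N step skips the whole 8x8x16 plane.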


def _estimate_memory_transfer_efficiency(
    arch, is_read, mem_area, format: TensorFormat, element_bits, block_size, shape4D, to_transfer
):
    burst_len = 8

    strides = _strides_for_shape(shape4D, format, element_bits)

    if format == TensorFormat.NHCWB16:
        if strides[2] == block_size.depth:  # TODO: is this check correct for non-8-bit?
            burst_len = element_bits * block_size.depth * block_size.width
        elif is_read:
            burst_len = 16 * element_bits * block_size.width
        else:
            burst_len = 16 * element_bits * block_size.width * arch.ncores
    elif format == TensorFormat.NHWC:
        if is_read:
            if strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = element_bits * block_size.depth
        else:
            if block_size.depth <= 16 and strides[3] == block_size.depth:
                burst_len = element_bits * block_size.depth * block_size.width
            else:
                burst_len = min(64 * 8, 16 * element_bits * arch.ncores, block_size.depth * element_bits)

    burst_len = burst_len // 8  # bits->bytes
    burst_len = min(arch.memory_burst_length[mem_area], burst_len)
    return to_transfer * (arch.memory_burst_length[mem_area] / burst_len)
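
# Reading the scaling above: burst_len is the useful bytes per memory burst
# that the access pattern can achieve, clamped to the memory's native burst
# length. If only 16 bytes of a 128-byte DRAM burst are useful, a transfer is
# costed at 8x its raw size; when bursts are fully used the factor is 1.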


def _estimate_minimum_memory_cycles(arch, query: PerformanceQuery):
    # Input block HW transfer (only for elements present)
    ifm_bytes = Shape4D.min(query.ifm_shape, query.config.ifm_block).elements()
    cycles_ifm_blk = arch.memory_latency[query.ifm_memory_area][BandwidthDirection.Read]
    cycles_ifm_blk = cycles_ifm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm_memory_area,
            query.ifm_format,
            query.ifm_bits,
            query.config.ifm_block,
            query.ifm_shape,
            ifm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ifm_memory_area]
    )
    # Output block HW transfer (only for elements present)
    ofm_bytes = Shape4D.min(query.ofm_shape, query.config.ofm_block).elements()
    cycles_ofm_blk = arch.memory_latency[query.ofm_memory_area][BandwidthDirection.Write]
    cycles_ofm_blk = cycles_ofm_blk + (
        _estimate_memory_transfer_efficiency(
            arch,
            False,
            query.ofm_memory_area,
            query.ofm_format,
            query.ofm_bits,
            query.config.ofm_block,
            query.ofm_shape,
            ofm_bytes,
        )
        / arch.memory_bandwidths_per_cycle[query.ofm_memory_area]
    )
    return cycles_ifm_blk, cycles_ofm_blk


def _estimate_output_cycles_per_element(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    if query.npu_block_type == NpuBlockType.ElementWise and query.ifm_bits == 32:
        # Binary op (ifm2 present) else unary op
        output_perf_index = 0 if query.ifm2_shape is not None else 1
    elif op_type == Op.Mul and query.ofm_bits == 32:
        output_perf_index = 2
    elif op_type == Op.Mul or (
        query.npu_block_type
        in (
            NpuBlockType.ConvolutionMxN,
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
            NpuBlockType.ReduceSum,
            NpuBlockType.VectorProduct,
        )
        and query.config.acc_type == SHRAMElements.Acc40
    ):
        output_perf_index = 3
    elif op_type in (Op.Add, Op.Sub):
        # Always costed as the advanced Add/Sub variant; the simple variant
        # (perf index 4) is currently never selected.
        # TODO: Add as perf selection as operator variant
        output_perf_index = 5
    elif op_type.is_maxpool_op():
        output_perf_index = 6
    else:
        output_perf_index = 7

    if faf_type in (Op.Sigmoid, Op.Tanh, Op.LUT):
        activation_perf_index = 0
    elif faf_type in (Op.Relu, Op.Relu6, Op.ReluN1To1):
        activation_perf_index = 1
    else:
        activation_perf_index = 2

    cycle_per_elem = max(
        arch.output_cycles_per_elem[output_perf_index], arch.activation_cycles_per_elem[activation_perf_index]
    )

    if op_type.is_elementwise_op():
        num_elems_blk = query.config.ofm_block.elements()
        ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
        cycle_cmd = ifm_blk_cycles + ofm_blk_cycles
        cycle_cmd = (cycle_cmd + cycle_per_elem * num_elems_blk) / 4  # per DPU
        cycle_per_elem = max(cycle_per_elem, cycle_cmd / num_elems_blk)

    return cycle_per_elem
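
# Reading the estimate above: the per-element rate is the slower of the
# output path and the activation path. With hypothetical table values of
# 0.25 output cycles/element and 0.5 activation cycles/element (e.g. for a
# fused Sigmoid), the element cost is 0.5, i.e. the activation dominates.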


def _estimate_conv_cycles(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)

    if (
        arch.config.ofm_ublock.height == 2
        and query.npu_block_type
        in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.VectorProduct)
        and query.ofm_shape.height == 1
        # Optimisation only applies for even width tensors
        and query.ofm_shape.width % 2 == 0
        and query.kernel.height == 1
    ):
        ofm_ublock = Shape4D(1, 1, 4, arch.config.ofm_ublock.depth)
        ofm_block = ofm_block.with_height(1)
    else:
        ofm_ublock = Shape4D(arch.config.ofm_ublock.to_hwc())

    num_ublk_x = numeric_util.round_up_divide(ofm_block.width, ofm_ublock.width)
    num_ublk_y = numeric_util.round_up_divide(ofm_block.height, ofm_ublock.height)
    num_ublk_xy = num_ublk_x * num_ublk_y
    num_ublk_z = numeric_util.round_up_divide(ofm_block.depth, ofm_ublock.depth)
    use_acc_40bits = query.config.acc_type == SHRAMElements.Acc40

    sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
    n_sub_kernels_y = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
    n_sub_kernels_x = numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])
    sub_kernel_x = [
        min((query.kernel.width - i * sub_kernel_limits[1]), sub_kernel_limits[1]) for i in range(n_sub_kernels_x)
    ]
    sub_kernel_y = [
        min((query.kernel.height - i * sub_kernel_limits[0]), sub_kernel_limits[0]) for i in range(n_sub_kernels_y)
    ]
    sub_kernel_size = (x * y for y in sub_kernel_y for x in sub_kernel_x)

    cycles_dpu_blk = 0
    cycles_wb = 32 * ofm_ublock.depth // 8

    for num_kernel_elems in sub_kernel_size:
        if query.npu_block_type == NpuBlockType.Pooling:
            num_kernel_steps = 1
            cycles = max(4, num_kernel_elems) * num_ublk_xy * num_ublk_z
            if query.ifm_bits == 16 and arch.accelerator_config != Accelerator.Ethos_U55_32:
                cycles *= 2
        elif query.npu_block_type == NpuBlockType.ConvolutionDepthWise:
            cycles = 4 * num_ublk_xy
            if query.ifm_bits == 16:
                cycles *= 2
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, 4)
            cycles = max(cycles_wb, cycles) * num_kernel_steps * num_ublk_z
        elif (
            (query.npu_block_type == NpuBlockType.ConvolutionMxN and not query.config.is_partkernel)
            or query.npu_block_type == NpuBlockType.VectorProduct
            or query.npu_block_type == NpuBlockType.ReduceSum
        ):
            num_kernel_steps = num_kernel_elems
            cycles = max(cycles_wb, 4 * num_ublk_xy) * num_kernel_steps * num_ublk_z
        else:
            assert query.config.is_partkernel
            divider = 2 if query.ifm_bits == 16 else 4
            num_kernel_steps = numeric_util.round_up_divide(num_kernel_elems, divider)
            cycles = max(cycles_wb, 4 * num_ublk_xy) * (
                num_kernel_steps * numeric_util.round_up_divide(ifm_block.depth, 8) * num_ublk_z
            )

        delay_cycles = 0
        if arch.accelerator_config is Accelerator.Ethos_U55_32:
            delay = 7 if use_acc_40bits else 3
            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z
            if (num_ublk_x == 1 or num_ublk_y == 1) and num_ublk_z > 1 and use_acc_40bits:
                delay_cycles += delay * num_ublk_z
        else:
            if use_acc_40bits and arch.accelerator_config in (Accelerator.Ethos_U55_64, Accelerator.Ethos_U55_128):
                delay = 3
            else:
                delay = 2

            if num_ublk_x == 1 and num_ublk_y == 1:
                if num_ublk_z == 1:
                    delay_cycles = delay * num_kernel_steps
                elif num_kernel_steps > 1:
                    delay_cycles = delay * (num_kernel_steps - 1) * num_ublk_z

        if query.npu_block_type == NpuBlockType.ConvolutionMxN and query.config.is_partkernel:
            delay_cycles *= numeric_util.round_up_divide(ifm_block.depth, 8)

        cycles_dpu_blk += cycles
        cycles_dpu_blk += delay_cycles

    if query.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        cycles_dpu_blk *= numeric_util.round_up_divide(query.ifm_shape.depth, ifm_block.depth)

    cycles_dpu_blk /= arch.ncores

    # Estimate output cycles
    num_ofm_blks = query.ofm_shape.div_round_up(ofm_block).elements()
    cycles_output_blk = _estimate_output_cycles_per_element(arch, op_type, faf_type, query) * ofm_block.elements()

    # Scale and bias tensor
    if query.const_shape.depth > 0:
        cycles_bias_blk = (
            10 * ofm_block.depth * arch.memory_latency[query.const_memory_area][BandwidthDirection.Read] / 256
        )
        cycles_output_blk = max(cycles_output_blk, cycles_bias_blk)

    ifm_blk_cycles, ofm_blk_cycles = _estimate_minimum_memory_cycles(arch, query)
    cycles_cmd = ifm_blk_cycles + ofm_blk_cycles
    cycles_cmd = (cycles_cmd + cycles_output_blk + cycles_dpu_blk) / 4  # per DPU

    cycles_dpu_blk = max(cycles_dpu_blk, cycles_cmd)
    cycles_output_blk = max(cycles_output_blk, cycles_cmd)

    if cycles_dpu_blk > cycles_output_blk:
        total_cycles = cycles_dpu_blk * num_ofm_blks + cycles_output_blk
    else:
        total_cycles = cycles_output_blk * num_ofm_blks + cycles_dpu_blk

    return total_cycles


def measure_mem2mem_cycles(arch, from_mem_area, to_mem_area, to_transfer):
    from_cycles = to_transfer // arch.memory_bandwidths_per_cycle[from_mem_area]
    to_cycles = to_transfer // arch.memory_bandwidths_per_cycle[to_mem_area]
    return max(from_cycles, to_cycles)
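
# Worked example (hypothetical bandwidths): moving 4096 bytes from DRAM at
# 4 bytes/cycle to SRAM at 8 bytes/cycle is max(1024, 512) = 1024 cycles;
# the slower of the two ports bounds the transfer.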


def measure_cycle_cost(arch, op_type: Op, faf_type: Op, query: PerformanceQuery):
    cycles = CycleCost()

    # Convolution/Vector product cycle calculation
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # cycles.op_macs and cycles.op_cycles should both handle >32-bits
        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            cycles.op_macs = int(query.kernel.elements_wh()) * 1 * int(query.ofm_shape.elements())
        else:
            cycles.op_macs = (
                int(query.kernel.elements_wh()) * int(query.ifm_shape.depth) * int(query.ofm_shape.elements())
            )

        cycles.op_cycles = int(_estimate_conv_cycles(arch, op_type, faf_type, query))
    # Elementwise cycle calculation
    elif query.npu_block_type == NpuBlockType.ElementWise:
        cycles.op_macs = 0
        cycles.op_cycles = int(_estimate_output_cycles_per_element(arch, op_type, faf_type, query)) * int(
            query.ofm_shape.elements()
        )
    else:
        assert False

    return cycles
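
# Usage sketch (assumes an ArchitectureFeatures instance `arch` and a fully
# populated query, including query.config block shapes and accumulator type):
#     cycles = measure_cycle_cost(arch, Op.Conv2DBias, None, query)
#     print(cycles)  # -> "macs = ..., cycles = ..."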


def measure_element_access(arch, query: PerformanceQuery):
    access = ElementAccess()

    ifm_block = Shape4D.min(query.ifm_shape, query.config.ifm_block)
    ofm_block = Shape4D.min(query.ofm_shape, query.config.ofm_block)
    ifm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ifm_format]))

    # Number of ofm blocks in the overall output shape
    ofm_blocks = query.ofm_shape.div_round_up(ofm_block)
    ofm_block_depth = ofm_block.depth
    if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
        ofm_blocks = ofm_blocks.with_depth(1)
        ofm_block_depth = query.ifm_shape.depth

    # Convolution & pooling
    if query.npu_block_type in (
        NpuBlockType.ConvolutionMxN,
        NpuBlockType.ConvolutionDepthWise,
        NpuBlockType.VectorProduct,
        NpuBlockType.Pooling,
        NpuBlockType.ReduceSum,
    ):
        # Number of sub kernels
        sub_kernel_limits = arch.sub_kernel_limits[query.npu_block_type]
        # sub_kernel_limits is [height limit, width limit], matching the
        # indexing used in _estimate_conv_cycles
        subkernels = numeric_util.round_up_divide(query.kernel.height, sub_kernel_limits[0])
        subkernels *= numeric_util.round_up_divide(query.kernel.width, sub_kernel_limits[1])

        ofm_block_count = ofm_blocks.elements()

        ifm_fetch = (
            Shape4D.round_up(ifm_block, ifm_rounding).elements_wh()
            * Shape4D.round_up(query.ifm_shape, ifm_rounding).depth
        )

        if query.npu_block_type in (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling):
            kernel_read = query.kernel.elements_wh() * 1  # force to no reread
        else:
            kernel_read = query.kernel.elements_wh() * query.ifm_shape.depth

        weight_fetch = kernel_read * ofm_block_depth * ofm_block_count

        access.ifm_read[0] = ifm_fetch * subkernels * ofm_block_count

        if query.npu_block_type not in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            access.const_read[0] = weight_fetch
            access.const_read[1] = query.ofm_shape.depth  # Scales & biases
            access.weights_refetch = ofm_blocks.elements_wh()
    # Elementwise
    elif query.npu_block_type == NpuBlockType.ElementWise:
        if query.ifm_shape.elements() == 1:
            if query.ifm_bits > 8:
                # ifm is a non 8-bit scalar
                access.ifm_read[0] = Shape4D.round_up(query.ifm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
        else:
            access.ifm_read[0] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
            if query.ifm2_shape:
                if query.ifm2_shape.elements() > 1:
                    access.ifm_read[1] = Shape4D.round_up(query.ofm_shape, ifm_rounding).elements()
                elif query.ifm2_bits > 8:
                    # ifm2 is a non 8-bit scalar
                    access.ifm_read[1] = Shape4D.round_up(query.ifm2_shape, ifm_rounding).elements()
    # Unknown
    else:
        assert False

    ofm_rounding = Shape4D(list(arch.storage_rounding_quantums[query.ofm_format]))
    access.ofm_write = Shape4D.round_up(query.ofm_shape, ofm_rounding).elements()
    return access


def measure_performance_cost(
    arch, op_type: Op, faf_type: Op, query: PerformanceQuery, offset: Shape4D, sub_shape: Shape4D
):
    assert (query.ofm_bits > 0) and (query.ifm_bits > 0)
    assert query.ofm_shape.elements() != 0

    # Default to start if no offset provided
    if offset is None:
        offset = Shape4D(0, 0, 0, 0)

    # Default to entire area if no sub-shape provided
    if sub_shape is None:
        sub_shape = query.ofm_shape
    else:
        sub_shape = Shape4D.min(sub_shape, query.ofm_shape)

    sub_query = copy.deepcopy(query)
    sub_query.ofm_shape = query.ofm_shape.clip(offset, sub_shape)

    cycles = measure_cycle_cost(arch, op_type, faf_type, sub_query)
    access = measure_element_access(arch, sub_query)

    return access, cycles
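
# Usage sketch (hypothetical shapes): a scheduling proposal can cost just a
# horizontal stripe of the OFM by passing an offset and sub-shape, e.g.
#     access, cycles = measure_performance_cost(
#         arch, Op.Conv2DBias, None, query, Shape4D(0, 16, 0, 0), Shape4D(1, 16, 32, 32)
#     )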


def make_bandwidth_array():
    return np.zeros((MemArea.Size, TensorPurpose.Size, BandwidthDirection.Size))


def make_cycles_array():
    return np.zeros(PassCycles.Size)


def update_summary_cycles(arch, bws, cycles):
    cycles[PassCycles.SramAccess] = np.sum(bws[MemArea.Sram]) / arch.memory_bandwidths_per_cycle[MemArea.Sram]
    cycles[PassCycles.DramAccess] = np.sum(bws[MemArea.Dram]) / arch.memory_bandwidths_per_cycle[MemArea.Dram]
    cycles[PassCycles.OnChipFlashAccess] = (
        np.sum(bws[MemArea.OnChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OnChipFlash]
    )
    cycles[PassCycles.OffChipFlashAccess] = (
        np.sum(bws[MemArea.OffChipFlash]) / arch.memory_bandwidths_per_cycle[MemArea.OffChipFlash]
    )

    cycles[PassCycles.Total] = np.max(cycles[: PassCycles.Total])
    return cycles
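
# Example of the max() model above (hypothetical numbers): with Npu = 1000,
# SramAccess = 400 and DramAccess = 1500 cycles, Total = 1500, i.e. the DRAM
# traffic hides the compute entirely, matching the module description of
# taking the maximum of bandwidth and compute cycles.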


def estimate_full_op_performance(
    arch, schedule: Schedule, op: SchedulerOperation, prev_op: SchedulerOperation, block_config
):
    cycles_a = make_cycles_array()
    bws = make_bandwidth_array()
    scaled_bws = make_bandwidth_array()  # scaled bw with memory transfer efficiency
    macs = 0

    query = PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_format = op.ifm.format
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ofm_shape = op.ofm.shape
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    query.kernel = op.kernel
    query.config = block_config

    cost = schedule.cost_map[op]
    prev_cost = schedule.cost_map[prev_op] if prev_op else None
    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        if cost.buffered_weight_tensor:
            query.const_memory_area = cost.buffered_weight_tensor.mem_area
        else:
            query.const_memory_area = cost.npu_weights_tensor.mem_area

    cycles = measure_cycle_cost(arch, op.op_type, op.parent_op.activation and op.parent_op.activation.op_type, query)
    cycles_a[PassCycles.Npu] = cycles.op_cycles
    macs = cycles.op_macs

    access = measure_element_access(arch, query)

    # How many NPU cycles are available under the previously executing
    # operator for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0

    # LUT Transfer
    parent_op = op.parent_op
    lut_transfer_cycles = 0
    if parent_op.activation_lut:
        lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
        src_tensor = lut_tensor.src_tensor
        if src_tensor and lut_tensor.mem_area != src_tensor.mem_area:
            bw = src_tensor.storage_size()
            lut_transfer_cycles = measure_mem2mem_cycles(arch, src_tensor.mem_area, lut_tensor.mem_area, bw)

            bws[src_tensor.mem_area][lut_tensor.purpose][BandwidthDirection.Read] += bw
            # LUT read from SHRAM. TODO: remove?
            scaled_bws[lut_tensor.mem_area][lut_tensor.purpose][
                BandwidthDirection.Read
            ] += _estimate_memory_transfer_efficiency(
                arch,
                True,
                lut_tensor.mem_area,
                lut_tensor.format,
                lut_tensor.element_size(),
                query.config.ifm_block,
                Shape4D(lut_tensor.shape),
                bw,
            )

    if cost.npu_weights_tensor and cost.buffered_weight_tensor:
        # DMA Weight Transfer
        sz = 0
        # Get the size of the first DMA
        for core in range(0, arch.ncores):
            key = WeightKey(core, 0)
            if key in cost.npu_weights_tensor.encoded_ranges:
                weight_range = cost.npu_weights_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

        total_sz = len(cost.npu_weights_tensor.buffer)
        bws[cost.npu_weights_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Read] += total_sz
        bws[cost.buffered_weight_tensor.mem_area][TensorPurpose.Weights][BandwidthDirection.Write] += total_sz

        ws_first_transfer_cycles = measure_mem2mem_cycles(
            arch, cost.npu_weights_tensor.mem_area, cost.buffered_weight_tensor.mem_area, sz
        )

        # Add cycles for Weight + Scale Transfer
        cycles_a[PassCycles.Npu] = max(
            cost.full_weight_transfer_cycles - slack_cycles + cost.slack_buffering_cycles,
            cycles.op_cycles + max(ws_first_transfer_cycles - slack_cycles, 0),
        )

        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += lut_transfer_cycles
    else:
        # Add cycles for LUT Transfer
        cycles_a[PassCycles.Npu] += max(lut_transfer_cycles - slack_cycles, 0)

    # OFM write
    ofm = op.parent_op.ofm
    bw = access.ofm_write * ofm.element_size()
    bws[query.ofm_memory_area][ofm.purpose][BandwidthDirection.Write] += bw
    scaled_bws[ofm.mem_area][ofm.purpose][BandwidthDirection.Write] += _estimate_memory_transfer_efficiency(
        arch, False, query.ofm_memory_area, ofm.format, query.ofm_bits, query.config.ofm_block, query.ofm_shape, bw
    )

    # IFM read
    ifm = op.parent_op.ifm
    bw = access.ifm_read[0] * ifm.element_size()
    bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += bw
    scaled_bws[ifm.mem_area][ifm.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
        arch, True, query.ifm_memory_area, ifm.format, query.ifm_bits, query.config.ifm_block, query.ifm_shape, bw
    )
    if query.ifm2_shape:
        ifm2 = op.parent_op.ifm2
        bw = access.ifm_read[1] * ifm2.element_size()
        bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += bw
        scaled_bws[ifm2.mem_area][ifm2.purpose][BandwidthDirection.Read] += _estimate_memory_transfer_efficiency(
            arch,
            True,
            query.ifm2_memory_area,
            ifm2.format,
            op.ifm2.dtype.size_in_bits(),
            query.config.ifm_block,
            query.ifm2_shape,
            bw,
        )

    # Weight read
    if access.const_read[0] > 0:
        # alignment not accounted for in bandwidth_compression_scale_approx
        encoded_size_approx = (
            cost.npu_weights_tensor.elements() - access.const_read[1] * op.parent_op.bias.element_size()
        )
        orig_weight_size = parent_op.weights.elements()
        bandwidth_compression_scale_approx = encoded_size_approx / orig_weight_size
        bw = access.const_read[0] * bandwidth_compression_scale_approx
        bws[query.const_memory_area][TensorPurpose.Weights][BandwidthDirection.Read] += bw

    if access.const_read[1] > 0:
        # Scales & biases
        bw = access.const_read[1] * op.parent_op.bias.element_size()
        bws[query.const_memory_area][TensorPurpose.FSBias][BandwidthDirection.Read] += bw

    update_summary_cycles(arch, scaled_bws, cycles_a)

    return bws, macs, cycles_a


def calc_new_performance_for_network(nng, arch):
    total_bws = make_bandwidth_array()
    total_macs = 0
    total_cycles = np.zeros(PassCycles.Size)

    for sg in nng.subgraphs:
        prev_op = None
        for sched_op in sg.sched_ops:
            op_info = sg.schedule.cost_map[sched_op]
            bws, macs, cycles = estimate_full_op_performance(arch, sg.schedule, sched_op, prev_op, op_info.block_config)
            total_bws += bws
            total_macs += macs
            total_cycles += cycles
            prev_op = sched_op

    nng.bandwidths = total_bws
    nng.macs = total_macs
    nng.cycles = total_cycles