blob: 84d8354bdfe03bddf9bd65fe902173c86a928fc8 [file] [log] [blame]
Tim Halld8339a72021-05-27 18:49:40 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description: Architecture SHRAM allocator
18import enum
19import math
20from typing import Optional
21from typing import Tuple
22
23from .architecture_features import ArchitectureFeatures
24from .architecture_features import Block
25from .architecture_features import SHRAMConfig
26from .architecture_features import SHRAMElements
27from .ethos_u55_regs.ethos_u55_regs import resampling_mode
28from .numeric_util import round_up
29from .numeric_util import round_up_divide
30from .operation import Kernel
31from .operation import NpuBlockType
32from .range_set import MemoryRangeSet
33from .shape4d import Shape4D
34from .tensor import MemArea
35
36
class SHRAMLayout:
    """Bank boundaries of the SHRAM regions used by one block config.

    All fields are bank indices, zeroed until filled in by
    _try_block_config: ib_start/ib_end bound the IFM buffer, ib_start2 is
    the second (elementwise) input buffer, ab_start the accumulator buffer
    and lut_start the LUT region.
    """

    def __init__(self):
        # Everything starts at bank 0 until a layout has been computed
        self.ib_start = self.ib_end = self.ib_start2 = 0
        self.ab_start = self.lut_start = 0
44
45
class ArchitectureBlockConfig:
    """A block configuration (IFM/OFM block shapes plus SHRAM layout)."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0
        self.ifm_depth_buf_scaling = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM range used by this shared buffer, excluding the LUT region."""
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        """Return the legacy list form: [OFM height, OFM width, IFM depth, OFM depth]."""
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())
66
67
# Bit width of each SHRAM accumulator element type (used to size accumulator banks)
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}
69
70
class ElementwiseUsage(enum.IntEnum):
    """How SHRAM input-buffer space must be apportioned for an operation."""

    No = 0  # not elementwise: accumulator space is required instead of IFM2
    Full = 1  # elementwise with a full second input (IFM2 banks mirror IFM banks)
    Scalar = 2  # elementwise with a scalar second input (no IFM2 banks needed)
75
76
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Block,
    ifm_block: Block,
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
    ifm_depth_buf_scaling: int,
    cores: int,
) -> Optional[SHRAMLayout]:
    """Attempt to lay the given IFM/OFM blocks out in SHRAM.

    Computes the bank counts needed for the IFM buffer, the optional second
    elementwise input and (for non-elementwise ops) the accumulators, then
    places them between the reserved output banks and the LUT region.

    Returns the resulting SHRAMLayout, or None if the blocks do not fit.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Scale depth with cores: each buffer only holds its share of the depth
    ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
    ofm_depth = round_up_divide(ofm_block.depth, cores)

    # Always need IFM space (the * 2 reserves two copies of the block;
    # presumably for double buffering - confirm against HW docs)
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: a Full second input mirrors the IFM bank count,
        # a Scalar second input needs no banks at all
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
132
133
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True when part-kernel-first traversal should be used.

    Compares the hardware-utilisation estimate of part-kernel ordering
    against depth-kernel ordering for this IFM depth and kernel size.
    """
    # Shallow inputs always take the part-kernel path
    if ifm_shape.depth <= 8:
        return True

    # Rounding granularities depend on the IFM precision
    depth_round = 32 if ifm_bits == 8 else 16
    kernel_round = 4 if ifm_bits == 8 else 2

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, depth_round)
    part_utilisation = (ifm_shape.depth * kernel_elements) / (
        round_up(ifm_shape.depth, 8) * round_up(kernel_elements, kernel_round)
    )

    return part_utilisation > depth_utilisation
148
149
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify how the op uses elementwise input buffering."""
    if npu_op_type != NpuBlockType.ElementWise:
        return ElementwiseUsage.No
    # Scalar second operand needs no IFM2 banks; otherwise a full feed
    return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
157
158
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    # 16-bit scaled, non-pooling ops need the wider 40-bit accumulator
    needs_acc40 = scaled and (ifm_bits == 16) and (npu_op_type != NpuBlockType.Pooling)
    return SHRAMElements.Acc40 if needs_acc40 else SHRAMElements.Acc32
165
166
def is_nearest(ifm_resampling: resampling_mode) -> bool:
    """Return True when the IFM uses nearest-neighbour resampling."""
    return ifm_resampling == resampling_mode.NEAREST
169
170
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the upscale factor implied by the IFM resampling mode."""
    # Any resampling other than NONE implies 2x upscaling
    if ifm_resampling == resampling_mode.NONE:
        return 1
    return 2
174
175
def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
    """Choose the IFM block depth for this IFM shape and precision."""
    # 16-bit inputs: cap at 16 and round to a granule of 4
    if ifm_bits == 16:
        return round_up(min(ifm_shape.depth, 16), 4)
    # 8-bit inputs: part-kernel caps at 16, depth-kernel at 32,
    # rounded to the IFM microblock depth
    depth_cap = 16 if is_partkernel else 32
    return round_up(min(ifm_shape.depth, depth_cap), arch.ifm_ublock.depth)
182
183
Fredrik Svedberg3ff7a4a2021-09-29 10:08:04 +0200184def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
185 return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))
Tim Halld8339a72021-05-27 18:49:40 +0100186
187
def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
    """Return (width, height) of the IFM area needed to produce ofm_shape."""
    scale = to_upscale(resampling_mode)
    rounding = is_nearest(resampling_mode)
    required_h = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), scale, rounding)
    required_w = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), scale, rounding)
    return (required_w, required_h)
194
195
def _get_ifm_blocksize(
    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    """Compute the IFM block shape required to feed the given OFM block."""
    # Kernel extents are clamped to the maximum supported subkernel size
    kernel_h = min(kernel.area_height(), subkernel_limit.height)
    kernel_w = min(kernel.area_width(), subkernel_limit.width)

    # IFM block height, rounded up to the microblock granularity
    height = round_up(_required_size(ofm_block.height, kernel.stride.y, kernel_h, upscale, nearest), ublock.height)

    # IFM block width, rounded up to the microblock granularity
    width = round_up(_required_size(ofm_block.width, kernel.stride.x, kernel_w, upscale, nearest), ublock.width)

    # Depth is carried over from the OFM block (adjusted later for non-equal-depth ops)
    return Shape4D(1, height, width, ofm_block.depth)
214
215
def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
    """Clamp a block's height for the 1D-convolution special case.

    256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a
    specific interpretation of a more general constraint that can't be applied
    because the find_block_config function must return block configs that can
    be applied to any OFM shape.
    """
    is_conv1d_case = (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2)
    if not is_conv1d_case:
        return block
    return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
223
224
def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Shape4D,
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search OFM block configurations for the lowest estimated fetch cost.

    Sweeps width/height/depth over the OFM-microblock-aligned search space,
    keeping only candidates whose SHRAM layout fits (_try_block_config) and
    scoring each by estimated IFM + weight fetch traffic per OFM element.

    Returns the best ArchitectureBlockConfig, or None if nothing fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction: treat the bigger input as the IFM
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    # Search space: OFM shape clamped to the max block, rounded to microblocks
    ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    ofm_block,
                    ifm_block,
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                    ifm_depth_buf_scaling,
                    arch.ncores,
                )

                if layout:
                    # Block counts covering the whole OFM (rounded up and fractional)
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            # NOTE: stored as the pre-fit_block_for_ofm shape (non-1D-optimised)
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    # Record (width, height) so the transposed (height, width) probe is skipped
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None
382
383
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Block,
    ifm_shape: Block,
    ifm2_shape: Optional[Block],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.
    """
    # Check block config validity: positive, within max block, microblock-aligned
    if not all(
        blk > 0 and blk <= blk_max and blk % ublk == 0
        for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    ):
        return None
    # Elementwise larger-volume correction: treat the bigger input as the IFM
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    # Test whether this layout fits into SHRAM
    layout = _try_block_config(
        arch.shram,
        ew_usage,
        block_config_opt,
        ifm_block,
        ifm_bits,
        ifm_granule,
        acc_bits,
        acc_granule,
        lut_banks,
        ifm_depth_buf_scaling,
        arch.ncores,
    )
    if layout is None:
        return None
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    # NOTE: the original (non-1D-optimised) block is stored, not block_config_opt
    config.ofm_block = block_config
    return config
Tim Halld8339a72021-05-27 18:49:40 +0100472 return config