# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description: Architecture SHRAM allocator
import enum
import math
from typing import Dict
from typing import Optional
from typing import Tuple
from typing import Union

from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import SHRAMConfig
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .numeric_util import round_up
from .numeric_util import round_up_divide
from .operation import Kernel
from .operation import NpuBlockType
from .range_set import MemoryRangeSet
from .shape4d import Shape4D
from .tensor import MemArea


class SHRAMLayout:
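    """Start SHRAM bank indices for the input buffers (ib), accumulator buffer (ab) and LUT"""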
    def __init__(self):
        self.ib_start = 0
        self.ib_end = 0
        self.ib_start2 = 0
        self.ab_start = 0
        self.lut_start = 0


class ArchitectureBlockConfig:
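    """Chosen block configuration: SHRAM layout plus the IFM/OFM block shapes and accumulator type"""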
    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())


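# Number of bits per accumulator element for each SHRAM accumulator type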
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}


class ElementwiseUsage(enum.IntEnum):
    No = 0
    Full = 1
    Scalar = 2


def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
) -> Union[SHRAMLayout, None]:
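    """Try to lay out the given IFM/OFM blocks in SHRAM; returns a SHRAMLayout if the IFM
    (and IFM2 for elementwise), accumulators and LUT all fit within the banks, otherwise None"""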
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Always need IFM space
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout


def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
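    """Choose between part-kernel-first and depth-first traversal; True means part-kernel-first"""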
    if ifm_shape.depth <= 8:
        return True

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
    part_utilisation = (
        ifm_shape.depth
        * kernel_elements
        / (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
    )

    return part_utilisation > depth_utilisation


def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
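    """Determine how SHRAM must be shared with an elementwise IFM2: not at all, a full tensor or a scalar"""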
    ew_usage = ElementwiseUsage.No
    if npu_op_type == NpuBlockType.ElementWise:
        ew_usage = ElementwiseUsage.Full
        if uses_scalar:
            ew_usage = ElementwiseUsage.Scalar
    return ew_usage


def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    acc_type = SHRAMElements.Acc32
    if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
        acc_type = SHRAMElements.Acc40
    return acc_type


def is_nearest(ifm_resampling: resampling_mode) -> bool:
    return ifm_resampling == resampling_mode.NEAREST


def to_upscale(ifm_resampling: resampling_mode) -> int:
    # Upscaling depending on resampling mode
    return 1 if ifm_resampling == resampling_mode.NONE else 2


def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
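    # Clamp the IFM block depth to the hardware limit and round up to the required depth granule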
    if ifm_bits == 16:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
    else:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
    return ifm_blockdepth


def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
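    """Input extent required along one axis to produce `value` outputs, given the stride,
    kernel border, upscaling factor and nearest-neighbour resampling"""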
    return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))


def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
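    """Return the (width, height) of the IFM area needed to produce the given OFM shape"""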
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
    return (w1, h1)


def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
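    """IFM block shape needed to calculate one OFM block, limited by the maximum sub-kernel size
    and rounded up to the micro-block"""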
    # IFM block height
    h1 = _required_size(
        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
    )
    h2 = h1
    height = round_up(min(h1, h2), ublock.height)

    # IFM block width
    w1 = _required_size(
        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
    )
    w2 = w1
    width = round_up(min(w1, w2), ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)


def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a specific
    # interpretation of a more general constraint that can't be applied because the
    # find_block_config function must return block configs that can be applied to any OFM shape.
    if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
        return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
    return block


def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
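    """Search the OFM block space (height x width x depth) for the configuration with the lowest
    estimated IFM and weight fetch cost that fits into SHRAM. Returns None if nothing fits."""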
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                )

                if layout:
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    if npu_op_type == NpuBlockType.ElementWise:
                        relative_cost = ofm_shape.elements() / (height * width * depth)
                    else:
                        relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when the cost equals the best cost and the OFM block is small
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None


def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Union[Shape4D, Block],
    ifm_shape: Union[Shape4D, Block],
    ifm2_shape: Optional[Union[Shape4D, Block]],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
390 """
391 Given a block_config, returns a corresponding ArchitectureBlockConfig.
392 Returns None if the block_config does not fit or is invalid.
393 """
394 # Check block config validity
395 if not all(
396 blk > 0 and blk <= blk_max and blk % ublk == 0
397 for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
398 ):
399 return None
400 # Elementwise larger-volume correction
401 if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
402 ifm_shape = ifm2_shape
403
404 ew_usage = _ew_usage(npu_op_type, uses_scalar)
405
406 # Operator typing help
407 is_pooling = npu_op_type == NpuBlockType.Pooling
408 is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
409 is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
410
411 # Block config to be returned
412 config = ArchitectureBlockConfig()
413 config.is_partkernel = is_partkernel
414
415 # Accumulator & granule settings
416 config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
417
418 # Memory rounding granules
419 acc_granule = arch.accumulator_granules[config.acc_type]
420 acc_bits = _AccumulatorBits[config.acc_type]
421 if ew_usage != ElementwiseUsage.No:
422 ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
423 else:
424 ifm_granule = arch.ifm_bank_granules[ifm_bits]
425 lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
426 upscale = to_upscale(ifm_resampling)
Fredrik Svedberg3ff7a4a2021-09-29 10:08:04 +0200427 nearest = is_nearest(ifm_resampling)
Tim Halld8339a72021-05-27 18:49:40 +0100428 ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
Fredrik Svedberg3ff7a4a2021-09-29 10:08:04 +0200429 ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
Tim Halld8339a72021-05-27 18:49:40 +0100430 if not is_equal_depth_op:
431 ifm_block = ifm_block.with_depth(ifm_blockdepth)
432
Tim Hall30161572021-06-17 17:03:49 +0100433 # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
James Ward399c4a22021-10-20 11:04:46 +0100434 block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)
Tim Hall30161572021-06-17 17:03:49 +0100435
Tim Halld8339a72021-05-27 18:49:40 +0100436 layout = _try_block_config(
Tim Halle80038a2022-05-10 13:41:24 +0100437 arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
Tim Halld8339a72021-05-27 18:49:40 +0100438 )
439 if layout is None:
440 return None
441 config.layout = layout
442 config.bank_size = arch.shram_bank_size
443 config.ifm_block = ifm_block
Jacob Bohlinb8060f52021-08-09 12:22:51 +0100444 config.ofm_block = block_config
Tim Halld8339a72021-05-27 18:49:40 +0100445 return config