Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1 | # Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: Architecture SHRAM allocator |
| 18 | import enum |
| 19 | import math |
| 20 | from typing import Optional |
| 21 | from typing import Tuple |
| 22 | |
| 23 | from .architecture_features import ArchitectureFeatures |
| 24 | from .architecture_features import Block |
| 25 | from .architecture_features import SHRAMConfig |
| 26 | from .architecture_features import SHRAMElements |
| 27 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
| 28 | from .numeric_util import round_up |
| 29 | from .numeric_util import round_up_divide |
| 30 | from .operation import Kernel |
| 31 | from .operation import NpuBlockType |
| 32 | from .range_set import MemoryRangeSet |
| 33 | from .shape4d import Shape4D |
| 34 | from .tensor import MemArea |
| 35 | |
| 36 | |
class SHRAMLayout:
    """Bank boundaries describing how SHRAM is partitioned for one operation.

    All fields are bank indices: IFM buffer start/end, second-IFM start
    (elementwise), accumulator start, and LUT start.
    """

    def __init__(self):
        # All offsets default to bank 0 until a layout is computed
        self.ib_start = self.ib_end = self.ib_start2 = 0
        self.ab_start = self.lut_start = 0
| 44 | |
| 45 | |
class ArchitectureBlockConfig:
    """A resolved block configuration: SHRAM layout plus IFM/OFM block shapes."""

    def __init__(self):
        # Bank boundaries within SHRAM for this configuration
        self.layout = SHRAMLayout()
        # IFM/OFM block shapes
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()
        # Accumulator element type (SHRAMElements value)
        self.acc_type = SHRAMElements.Acc32
        # True when part-kernel-first traversal was selected
        self.is_partkernel = False
        # Size in bytes of one SHRAM bank
        self.bank_size = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM memory range used by this shared buffer, excluding the LUT area."""
        end_byte = self.layout.lut_start * self.bank_size
        return MemoryRangeSet(MemArea.Shram, 0, end_byte)

    def old_style_representation(self):
        """Return the legacy list form: [OFM height, OFM width, IFM depth, OFM depth]."""
        out_blk = self.ofm_block
        return [out_blk.height, out_blk.width, self.ifm_block.depth, out_blk.depth]

    def __str__(self):
        return str(self.old_style_representation())
| 65 | |
| 66 | |
# Bit width of each supported SHRAM accumulator element type
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}
| 68 | |
| 69 | |
class ElementwiseUsage(enum.IntEnum):
    """How SHRAM input banks are used by an (elementwise) operation."""

    No = 0  # Not elementwise: accumulator banks are required instead of an IFM2 buffer
    Full = 1  # Elementwise with a full second input: IFM2 gets its own banks (same size as IFM)
    Scalar = 2  # Elementwise with a scalar second input: no IFM2 banks required
| 74 | |
| 75 | |
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Block,
    ifm_block: Block,
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
) -> Optional[SHRAMLayout]:
    """Attempt to fit the given IFM/OFM blocks into SHRAM.

    Computes bank boundaries for the IFM buffer(s), accumulators and LUT.

    :param shram: SHRAM geometry (bank count, bank size, reserved banks)
    :param ew_usage: whether/how a second elementwise input must be housed
    :param ofm_block: OFM block shape (sizes the accumulators)
    :param ifm_block: IFM block shape (sizes the input buffer)
    :param ifm_bits: bits per IFM element (multiple of 8)
    :param ifm_granule: bank-count rounding granule for the IFM allocation
    :param acc_bits: bits per accumulator element
    :param acc_granule: bank-count rounding granule for the accumulators
    :param lut_banks: banks reserved at the top of SHRAM for the LUT
    :return: a SHRAMLayout if everything fits, otherwise None
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Always need IFM space; the *2 presumably double-buffers the IFM — confirm
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators:
    # LUT sits at the top, IFM starts above the reserved output banks
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: a full second input needs as many banks as the first
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
| 125 | |
| 126 | |
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True when part-kernel-first traversal should be used.

    Compares the hardware utilisation of part-kernel versus depth-kernel
    decomposition and picks the better one; shallow IFMs (depth <= 8)
    always use part-kernel.
    """
    if ifm_shape.depth <= 8:
        return True

    # Utilisation is actual depth/kernel elements over their rounded-up
    # (hardware-granule) sizes; granules depend on 8-bit vs 16-bit IFM
    kernel_wh = kernel.elements_wh()
    depth_granule = 32 if ifm_bits == 8 else 16
    part_granule = 4 if ifm_bits == 8 else 2

    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, depth_granule)
    part_utilisation = (
        ifm_shape.depth
        * kernel_wh
        / (round_up(ifm_shape.depth, 8) * round_up(kernel_wh, part_granule))
    )

    return part_utilisation > depth_utilisation
| 141 | |
| 142 | |
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Return how SHRAM should be partitioned for an elementwise operation."""
    if npu_op_type != NpuBlockType.ElementWise:
        return ElementwiseUsage.No
    # Scalar second operands need no IFM2 buffer; full tensors do
    return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
| 150 | |
| 151 | |
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Return the accumulator element type for the given operation."""
    # 16-bit scaled non-pooling operations need the wider 40-bit accumulator
    needs_acc40 = scaled and (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling
    return SHRAMElements.Acc40 if needs_acc40 else SHRAMElements.Acc32
| 158 | |
| 159 | |
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the IFM upscaling factor implied by the resampling mode."""
    if ifm_resampling == resampling_mode.NONE:
        return 1
    # Any resampling mode other than NONE upscales by 2
    return 2
| 163 | |
| 164 | |
def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
    """Return the IFM block depth: capped by traversal mode/bit width, rounded to a granule."""
    if ifm_bits == 16:
        # 16-bit IFM: cap at 16 and round to a granule of 4
        depth_cap, granule = 16, 4
    else:
        # 8-bit IFM: part-kernel caps at 16, depth-kernel at 32
        depth_cap = 16 if is_partkernel else 32
        granule = arch.ifm_ublock.depth
    return round_up(min(ifm_shape.depth, depth_cap), granule)
| 171 | |
| 172 | |
| 173 | def _required_size(value: int, stride: int, border: int, upscale: int) -> int: |
| 174 | return int(math.ceil(((value - 1) * stride + border) / upscale)) |
| 175 | |
| 176 | |
def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, upscale: int) -> Tuple[int, int]:
    """Return the (width, height) of IFM required to produce the given OFM shape."""
    height = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale)
    width = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale)
    return (width, height)
| 181 | |
| 182 | |
def _get_ifm_blocksize(
    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int
) -> Shape4D:
    """Return the IFM block shape required to feed the given OFM block.

    Height/width are derived from the OFM block extents, kernel stride and
    the subkernel-limited kernel area, then rounded up to the microblock.
    Depth is passed through unchanged (callers override it when the op does
    not require equal IFM/OFM depth).
    """
    # Note: the vestigial h2/w2 (= h1/w1) intermediates from the original
    # implementation have been removed; min(h1, h1) is just h1.

    # IFM block height
    h1 = _required_size(ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale)
    height = round_up(h1, ublock.height)

    # IFM block width
    w1 = _required_size(ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale)
    width = round_up(w1, ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)
| 197 | |
| 198 | |
def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
    """Clamp the block height for the Conv1D special case.

    256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a
    specific interpretation of a more general constraint that can't be applied
    because find_block_config must return block configs applicable to any OFM shape.
    """
    is_conv1d_case = (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2)
    if not is_conv1d_case:
        return block
    clamped_height = min(block.height, ofm_shape.height)
    return Shape4D(1, clamped_height, block.width, block.depth)
| 206 | |
| 207 | |
def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Shape4D,
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search the OFM block space for the lowest-cost block configuration.

    Iterates candidate OFM blocks (height/width in ofm_ublock steps, depth in
    ublock/OFMSplitDepth steps), checks each fits in SHRAM via _try_block_config,
    and scores it by estimated IFM + weight fetch traffic per OFM element.

    Returns the best ArchitectureBlockConfig found, or None if no candidate fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction: size buffers for the larger input
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    # Never allocate fewer LUT banks than the architecture reserves at the end
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)

    # Subkernel repeats of the IFM (kernels larger than SubKernelMax are split)
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    # Search space: OFM shape clamped to hardware maximum, rounded to microblocks
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        # Partial-depth blocks must be multiples of the split depth
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
                )

                if layout:
                    # full_blocks: integral block counts; blocks: fractional coverage
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            # Strictly better cost: reset the coverage tie-breaker
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    # Record (width, height) so the transposed probe above skips it
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None
| 345 | |
| 346 | |
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Block,
    ifm_shape: Block,
    ifm2_shape: Optional[Block],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.

    Unlike find_block_config, this validates a single caller-supplied OFM
    block rather than searching the block space.
    """
    # Check block config validity: each dimension must be positive, within the
    # hardware maximum, and a multiple of the microblock dimension
    if not all(
        blk > 0 and blk <= blk_max and blk % ublk == 0
        for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    ):
        return None
    # Elementwise larger-volume correction: size buffers for the larger input
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    # Never allocate fewer LUT banks than the architecture reserves at the end
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    # Derive the IFM block needed to feed the supplied OFM block
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    # Verify the blocks fit into SHRAM; bail out if they don't
    layout = _try_block_config(
        arch.shram, ew_usage, block_config, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
    )
    if layout is None:
        return None
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    return config