Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1 | # Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: Architecture SHRAM allocator |
| 18 | import enum |
| 19 | import math |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 20 | from typing import Dict |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 21 | from typing import Optional |
| 22 | from typing import Tuple |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 23 | from typing import Union |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 24 | |
| 25 | from .architecture_features import ArchitectureFeatures |
| 26 | from .architecture_features import Block |
| 27 | from .architecture_features import SHRAMConfig |
| 28 | from .architecture_features import SHRAMElements |
| 29 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
| 30 | from .numeric_util import round_up |
| 31 | from .numeric_util import round_up_divide |
| 32 | from .operation import Kernel |
| 33 | from .operation import NpuBlockType |
| 34 | from .range_set import MemoryRangeSet |
| 35 | from .shape4d import Shape4D |
| 36 | from .tensor import MemArea |
| 37 | |
| 38 | |
class SHRAMLayout:
    """Bank positions of the regions that make up one SHRAM layout.

    Every position starts at zero; _try_block_config fills in real values
    when a block configuration is found to fit.
    """

    def __init__(self):
        # IFM buffer boundaries: start, end, and start of the second IFM
        self.ib_start = self.ib_end = self.ib_start2 = 0
        # First accumulator bank and first LUT bank
        self.ab_start = self.lut_start = 0
| 46 | |
| 47 | |
class ArchitectureBlockConfig:
    """A chosen IFM/OFM block configuration together with its SHRAM layout."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM range used by this shared buffer, excluding the LUT.

        The range starts at address 0 and ends where the LUT banks begin.
        """
        lut_start_address = self.layout.lut_start * self.bank_size
        return MemoryRangeSet(MemArea.Shram, 0, lut_start_address)

    def old_style_representation(self):
        """Return [OFM height, OFM width, IFM depth, OFM depth] as a list."""
        ofm, ifm = self.ofm_block, self.ifm_block
        return [ofm.height, ofm.width, ifm.depth, ofm.depth]

    def __str__(self):
        legacy = self.old_style_representation()
        return str(legacy)
| 67 | |
| 68 | |
# Width in bits of one accumulator element, keyed by accumulator type
_AccumulatorBits = {
    SHRAMElements.Acc16: 16,
    SHRAMElements.Acc32: 32,
    SHRAMElements.Acc40: 40,
}
| 70 | |
| 71 | |
class ElementwiseUsage(enum.IntEnum):
    """How SHRAM is portioned with respect to elementwise operation.

    Selected by _ew_usage and consumed by _try_block_config.
    """

    # Not an elementwise operation: accumulator banks are allocated
    No = 0
    # Elementwise: the second IFM is given as many banks as the first
    Full = 1
    # Elementwise using a scalar: no banks are allocated for the second IFM
    Scalar = 2
| 76 | |
| 77 | |
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
) -> Union[SHRAMLayout, None]:
    """Try to lay out the given IFM/OFM blocks in SHRAM.

    Banks are arranged, in increasing bank order, as: reserved output banks,
    IFM banks, accumulator banks (non-elementwise only) and LUT banks at the end.

    Args:
        shram: SHRAM description (total banks, bank size, reserved output banks).
        ew_usage: Elementwise usage; anything other than ``No`` skips the
            accumulators and instead checks that a second IFM fits.
        ofm_block: OFM block, used to size the accumulators.
        ifm_block: IFM block, used to size the IFM buffer.
        ifm_bits: IFM element width in bits; must be a multiple of 8.
        ifm_granule: IFM bank allocation granule.
        acc_bits: Accumulator element width in bits.
        acc_granule: Accumulator bank allocation granule.
        lut_banks: Number of banks kept free at the end of SHRAM for the LUT.

    Returns:
        A populated SHRAMLayout with integer bank indices, or None if the
        blocks do not fit.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Always need IFM space.
    # ifm_bits is a multiple of 8 (asserted above), so floor division is exact here
    # and keeps every bank count/index an int rather than a float.
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) // 8, 8)
    # Bank count is doubled (presumably for double buffering - confirm) then rounded to the granule
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: the second IFM (if not scalar) must fit before the LUT
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        # No accumulators, so the IFM region extends up to the LUT
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
| 127 | |
| 128 | |
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True if part-kernel traversal should be used, False for depth-kernel.

    IFM depths of 8 or less always select part-kernel; otherwise the method
    with the strictly better utilisation wins, with ties going to depth-kernel.
    """
    ifm_depth = ifm_shape.depth
    if ifm_depth <= 8:
        return True

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    is_8bit = ifm_bits == 8
    kernel_elements = kernel.elements_wh()

    # Depth-kernel: depth is padded to 32 (8-bit IFM) or 16 (otherwise)
    depth_utilisation = ifm_depth / round_up(ifm_depth, 32 if is_8bit else 16)

    # Part-kernel: depth is padded to 8, kernel elements to 4 (8-bit IFM) or 2 (otherwise)
    padded_volume = round_up(ifm_depth, 8) * round_up(kernel_elements, 4 if is_8bit else 2)
    part_utilisation = ifm_depth * kernel_elements / padded_volume

    return part_utilisation > depth_utilisation
| 143 | |
| 144 | |
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify the operation's elementwise usage of SHRAM.

    Non-elementwise operations map to ``No``; elementwise operations map to
    ``Scalar`` when a scalar is used and to ``Full`` otherwise.
    """
    if npu_op_type == NpuBlockType.ElementWise:
        return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
    return ElementwiseUsage.No
| 152 | |
| 153 | |
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type

    Acc40 is selected for scaled, non-pooling operations on 16-bit IFMs;
    everything else uses Acc32.
    """
    needs_acc40 = (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled
    if needs_acc40:
        return SHRAMElements.Acc40
    return SHRAMElements.Acc32
| 160 | |
| 161 | |
def is_nearest(ifm_resampling: resampling_mode) -> bool:
    """Return True when the IFM resampling mode is NEAREST."""
    nearest = ifm_resampling == resampling_mode.NEAREST
    return nearest
| 164 | |
| 165 | |
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the upscale factor for a resampling mode: 1 for NONE, otherwise 2."""
    if ifm_resampling == resampling_mode.NONE:
        return 1
    return 2
| 169 | |
| 170 | |
def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
    """Return the IFM block depth for the given IFM shape and traversal mode.

    The IFM depth is capped (16 for 16-bit IFMs or part-kernel, 32 otherwise)
    and then rounded up to a granule (4 for 16-bit IFMs, the IFM ublock depth
    otherwise).
    """
    if ifm_bits == 16:
        depth_cap, granule = 16, 4
    else:
        depth_cap = 16 if is_partkernel else 32
        granule = arch.ifm_ublock.depth
    return round_up(min(ifm_shape.depth, depth_cap), granule)
| 177 | |
| 178 | |
Fredrik Svedberg | 3ff7a4a | 2021-09-29 10:08:04 +0200 | [diff] [blame] | 179 | def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int: |
| 180 | return int(math.ceil(((value - 1) * stride + border + nearest) / upscale)) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 181 | |
| 182 | |
def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
    """Return the IFM area needed to produce `ofm_shape` with `kernel`.

    The result is ordered (width, height).
    """
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    stride = kernel.stride
    required_height = _required_size(ofm_shape.height, stride.y, kernel.area_height(), upscale, nearest)
    required_width = _required_size(ofm_shape.width, stride.x, kernel.area_width(), upscale, nearest)
    return (required_width, required_height)
| 191 | |
| 192 | |
def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    """Return the IFM block needed to feed one OFM block.

    The kernel extent on each axis is clamped to `subkernel_limit`, and the
    resulting height and width are rounded up to the `ublock` dimensions.
    The returned shape carries the OFM block's depth.
    """
    # Kernel footprint actually covered in one pass, per axis
    kernel_height = min(kernel.area_height(), subkernel_limit.height)
    kernel_width = min(kernel.area_width(), subkernel_limit.width)

    # IFM block height
    ifm_height = _required_size(ofm_block.height, kernel.stride.y, kernel_height, upscale, nearest)
    # IFM block width
    ifm_width = _required_size(ofm_block.width, kernel.stride.x, kernel_width, upscale, nearest)

    return Shape4D(
        1,
        round_up(ifm_height, ublock.height),
        round_up(ifm_width, ublock.width),
        ofm_block.depth,
    )
| 211 | |
| 212 | |
def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    """Return `block`, with its height clamped to the OFM height for the Conv1D case.

    256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a specific
    interpretation of a more general constraint that can't be applied because the
    find_block_config function must return block configs that can be applied to any OFM shape.
    """
    is_conv1d_case = (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2)
    if not is_conv1d_case:
        return block

    clamped_height = min(block.height, ofm_shape.height)
    return Shape4D(1, clamped_height, block.width, block.depth)
| 222 | |
| 223 | |
def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search for the OFM block configuration with the lowest estimated cost.

    Candidate OFM blocks are enumerated in steps of the OFM ublock, bounded by
    the OFM shape and arch.ofm_block_max. Each candidate that fits in SHRAM
    (per _try_block_config) is given a relative cost, and the cheapest one is
    kept. Among equal-cost candidates, a later one replaces the current best
    only if its height and width are both at most 4 and its IFM coverage
    ratio is not worse.

    Args:
        arch: Architecture description (SHRAM, ublocks, granules, limits).
        npu_op_type: Type of NPU operation being configured.
        ofm_shape: Shape of the OFM.
        ifm_shape: Shape of the IFM.
        ifm2_shape: Shape of the second IFM, if any; used in place of
            ifm_shape when it has more elements.
        uses_scalar: Whether an elementwise operation uses a scalar input.
        ifm_bits: IFM element width in bits.
        kernel: Kernel of the operation.
        lut_banks: Number of LUT banks requested; raised to at least
            arch.shram.reserved_end_banks.
        scaled: Passed to _acc_type to choose the accumulator type.
        ifm_resampling: IFM resampling mode, used for upscale/nearest handling.

    Returns:
        The chosen ArchitectureBlockConfig, or None if no candidate fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    # Search space: the OFM shape clamped to the maximum OFM block, rounded up to whole ublocks
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    # Starting depth; depths smaller than the OFM depth are rounded up to a multiple of SplitDepth
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        # Reset per depth: shapes recorded here are skipped for the rest of this depth
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                )

                if layout:
                    # Block counts over the OFM: rounded up per axis, and the plain quotient
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    if npu_op_type == NpuBlockType.ElementWise:
                        # Elementwise cost ignores the fetch estimates: OFM elements per block, at least 1
                        relative_cost = max(ofm_shape.elements() / (height * width * depth), 1)
                    else:
                        relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            # Strictly cheaper: always take it and restart coverage tracking
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            # Stored block uses the loop's height/width/depth, not the output of fit_block_for_ofm
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    # Key is deliberately transposed so the (width x height) variant is skipped later
                    wont_fit[(width, height)] = True

        # Advance to the next candidate depth, using the same SplitDepth rounding as the initial depth
        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    # best_cost is only lowered from infinity when at least one candidate fitted
    if best_cost != math.inf:
        return config

    return None
| 373 | |
| 374 | |
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Union[Shape4D, Block],
    ifm_shape: Union[Shape4D, Block],
    ifm2_shape: Optional[Union[Shape4D, Block]],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.

    A block_config is invalid when any of its dimensions is not positive,
    exceeds the matching arch.ofm_block_max dimension, or is not a multiple
    of the matching arch.ofm_ublock dimension.
    """
    # Check block config validity, one dimension at a time
    dimension_limits = zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    for blk, blk_max, ublk in dimension_limits:
        if blk <= 0 or blk > blk_max or blk % ublk != 0:
            return None

    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)
    is_elementwise = ew_usage != ElementwiseUsage.No

    # Operators whose IFM block keeps the depth of the OFM block
    is_equal_depth_op = (
        is_elementwise
        or npu_op_type == NpuBlockType.Pooling
        or npu_op_type == NpuBlockType.ConvolutionDepthWise
    )

    # Accumulator type and the memory rounding granules that follow from it
    acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
    acc_granule = arch.accumulator_granules[acc_type]
    acc_bits = _AccumulatorBits[acc_type]
    granule_table = arch.ifm_ew_bank_granules if is_elementwise else arch.ifm_bank_granules
    ifm_granule = granule_table[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)

    # IFM block required to feed the requested OFM block
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    layout = _try_block_config(
        arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
    )
    if layout is None:
        return None

    # Fits: assemble the result. The stored OFM block is the caller's block_config,
    # not the Conv1D-optimised one used for the fit test.
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel
    config.acc_type = acc_type
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    config.ofm_block = block_config
    return config