# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description: Architecture SHRAM allocator
| 18 | import enum |
| 19 | import math |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame^] | 20 | from typing import Dict |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 21 | from typing import Optional |
| 22 | from typing import Tuple |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame^] | 23 | from typing import Union |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 24 | |
| 25 | from .architecture_features import ArchitectureFeatures |
| 26 | from .architecture_features import Block |
| 27 | from .architecture_features import SHRAMConfig |
| 28 | from .architecture_features import SHRAMElements |
| 29 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
| 30 | from .numeric_util import round_up |
| 31 | from .numeric_util import round_up_divide |
| 32 | from .operation import Kernel |
| 33 | from .operation import NpuBlockType |
| 34 | from .range_set import MemoryRangeSet |
| 35 | from .shape4d import Shape4D |
| 36 | from .tensor import MemArea |
| 37 | |
| 38 | |
class SHRAMLayout:
    """Bank boundaries of the buffers placed in SHRAM.

    Every field starts at 0; the allocator fills them in once a block
    configuration has been found to fit.
    """

    def __init__(self):
        # IFM buffer: first bank, end bank, and start of the second IFM buffer
        self.ib_start = 0
        self.ib_end = 0
        self.ib_start2 = 0
        # First bank of the accumulator buffer
        self.ab_start = 0
        # First bank of the LUT region
        self.lut_start = 0

    def __repr__(self):
        return (
            f"{type(self).__name__}(ib_start={self.ib_start}, ib_end={self.ib_end}, "
            f"ib_start2={self.ib_start2}, ab_start={self.ab_start}, lut_start={self.lut_start})"
        )
| 46 | |
| 47 | |
class ArchitectureBlockConfig:
    """A block configuration together with the SHRAM layout it requires."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0
        self.ifm_depth_buf_scaling = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM range used by this shared buffer, excluding the LUT.

        The range starts at 0 and ends where the LUT region begins
        (``layout.lut_start`` banks of ``bank_size`` each).
        """
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        """Return ``[OFM height, OFM width, IFM depth, OFM depth]`` as a list."""
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())
| 68 | |
| 69 | |
# Width in bits of one accumulator element, keyed by accumulator type
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}
| 71 | |
| 72 | |
class ElementwiseUsage(enum.IntEnum):
    """How an operation uses SHRAM for elementwise processing."""

    # Not an elementwise operation; accumulator space is required
    No = 0
    # Elementwise operation; the second IFM gets as many banks as the first
    Full = 1
    # Elementwise operation using a scalar; no banks for a second IFM
    Scalar = 2
| 77 | |
| 78 | |
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
    ifm_depth_buf_scaling: int,
    cores: int,
) -> Union[SHRAMLayout, None]:
    """Try to place the IFM and accumulator buffers of a block pair in SHRAM.

    Args:
        shram: SHRAM description (total banks, bank size, reserved output banks).
        ew_usage: elementwise usage of the operation.
        ofm_block: candidate OFM block.
        ifm_block: IFM block that feeds ``ofm_block``.
        ifm_bits: IFM element width in bits; must be a multiple of 8.
        ifm_granule: bank granule the IFM allocation is rounded up to.
        acc_bits: accumulator element width in bits.
        acc_granule: bank granule the accumulator allocation is rounded up to.
        lut_banks: number of banks kept at the end of SHRAM for the LUT.
        ifm_depth_buf_scaling: divisor applied to the IFM block depth.
        cores: divisor applied to the OFM block depth.

    Returns:
        The SHRAM layout when everything fits, otherwise None.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Scale depth with cores
    ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
    ofm_depth = round_up_divide(ofm_block.depth, cores)

    # Always need IFM space. ifm_bits is a multiple of 8 (asserted above), so the
    # integer division is exact and keeps all bank counts integral.
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) // 8, 8)
    # Twice the banks needed for a single IFM block, rounded to the granule
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: a second IFM buffer is only needed for non-scalar usage
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
| 134 | |
| 135 | |
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True when part-kernel traversal should be used, False for depth-kernel.

    IFMs with a depth of 8 or less always use part-kernel. Otherwise the
    method with the strictly better utilisation wins, with ties going to
    depth-kernel.
    """
    if ifm_shape.depth <= 8:
        return True

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
    part_utilisation = (
        ifm_shape.depth
        * kernel_elements
        / (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
    )

    return part_utilisation > depth_utilisation
| 150 | |
| 151 | |
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify the elementwise SHRAM usage of an operation.

    Returns ``No`` for non-elementwise operations, ``Scalar`` for elementwise
    operations that use a scalar and ``Full`` for all other elementwise
    operations.
    """
    ew_usage = ElementwiseUsage.No
    if npu_op_type == NpuBlockType.ElementWise:
        ew_usage = ElementwiseUsage.Full
        if uses_scalar:
            ew_usage = ElementwiseUsage.Scalar
    return ew_usage
| 159 | |
| 160 | |
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type

    Acc40 is selected for scaled, non-pooling operations with a 16-bit IFM;
    everything else uses Acc32.
    """
    acc_type = SHRAMElements.Acc32
    if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
        acc_type = SHRAMElements.Acc40
    return acc_type
| 167 | |
| 168 | |
def is_nearest(ifm_resampling: resampling_mode) -> bool:
    """Return True when the IFM resampling mode is NEAREST."""
    return ifm_resampling == resampling_mode.NEAREST
| 171 | |
| 172 | |
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the upscaling factor for a resampling mode: 1 for NONE, otherwise 2."""
    # Upscaling depending on resampling mode
    return 1 if ifm_resampling == resampling_mode.NONE else 2
| 176 | |
| 177 | |
def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool) -> int:
    """Return the IFM block depth to use for the given IFM shape.

    16-bit IFMs are capped at a depth of 16 and rounded up to a multiple of 4.
    Other IFMs are capped at 16 (part-kernel) or 32 (otherwise) and rounded up
    to the depth of the architecture's IFM micro-block.
    """
    if ifm_bits == 16:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
    else:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
    return ifm_blockdepth
| 184 | |
| 185 | |
Fredrik Svedberg | 3ff7a4a | 2021-09-29 10:08:04 +0200 | [diff] [blame] | 186 | def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int: |
| 187 | return int(math.ceil(((value - 1) * stride + border + nearest) / upscale)) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 188 | |
| 189 | |
def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
    """Return the ``(width, height)`` of IFM needed to produce ``ofm_shape``.

    Takes the kernel stride, the kernel area and the resampling mode into
    account. Note that the result is ordered width first.
    """
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
    return (w1, h1)
| 198 | |
| 199 | |
def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    """Return the IFM block required to feed ``ofm_block``.

    The kernel area considered in each dimension is limited to
    ``subkernel_limit``, and the resulting height/width are rounded up to the
    micro-block size. The depth is copied from ``ofm_block``.
    """
    # IFM block height
    h1 = _required_size(
        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
    )
    height = round_up(h1, ublock.height)

    # IFM block width
    w1 = _required_size(
        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
    )
    width = round_up(w1, ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)
| 218 | |
| 219 | |
def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    """Return ``block`` adjusted for the Conv1D case, otherwise unchanged.

    When the OFM height and kernel height are both 1 and the OFM micro-block
    height is 2, a new Shape4D is returned whose height is clamped to the OFM
    height; in every other case the ``block`` argument itself is returned.
    """
    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) This is a specific
    # interpretation of a more general constraint that can't be applied because the
    # find_block_config function must return block configs that can be applied to any OFM shape.
    if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
        return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
    return block
| 229 | |
| 230 | |
def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search for the lowest-cost block configuration that fits in SHRAM.

    Candidate OFM blocks are enumerated over depth, height and width in
    micro-block steps. Each candidate that fits is scored by the amount of IFM
    and weight data fetched per OFM element, and the cheapest one is kept.

    Args:
        arch: architecture description.
        npu_op_type: block type of the operation.
        ofm_shape: OFM shape.
        ifm_shape: IFM shape.
        ifm2_shape: second IFM shape, or None.
        uses_scalar: whether an elementwise operation uses a scalar.
        ifm_bits: IFM element width in bits.
        kernel: kernel of the operation.
        lut_banks: banks requested for the LUT (at least the reserved end banks are used).
        scaled: whether the operation is scaled (affects the accumulator type).
        ifm_resampling: IFM resampling mode.

    Returns:
        The chosen configuration, or None when no candidate fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    # NOTE(review): ifm_shape may already have been replaced by ifm2_shape above, in
    # which case the depth comparison below is trivially true - confirm this is intended.
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        # Keyed by (width, height) of blocks that did not fit at this depth
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                    ifm_depth_buf_scaling,
                    arch.ncores,
                )

                if layout:
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            # Strictly better cost: reset the coverage tie-breaker
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            # Store the block as enumerated, not the Conv1D-adjusted one
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None
| 388 | |
| 389 | |
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Union[Shape4D, Block],
    ifm_shape: Union[Shape4D, Block],
    ifm2_shape: Optional[Union[Shape4D, Block]],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.

    A block_config is invalid when any of its dimensions is not positive,
    exceeds the architecture's maximum OFM block, or is not a multiple of the
    OFM micro-block. The returned configuration stores block_config itself as
    its OFM block.
    """
    # Check block config validity
    if not all(
        blk > 0 and blk <= blk_max and blk % ublk == 0
        for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    ):
        return None
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    # NOTE(review): ifm_shape may already have been replaced by ifm2_shape above, in
    # which case the depth comparison below is trivially true - confirm this is intended.
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    layout = _try_block_config(
        arch.shram,
        ew_usage,
        block_config_opt,
        ifm_block,
        ifm_bits,
        ifm_granule,
        acc_bits,
        acc_granule,
        lut_banks,
        ifm_depth_buf_scaling,
        arch.ncores,
    )
    if layout is None:
        return None
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    # The caller's block is stored, not the Conv1D-adjusted one used for the fit test
    config.ofm_block = block_config
    return config