William Isaksson | 56e5f0c | 2024-01-10 12:28:04 +0100 | [diff] [blame] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Utility functions for code generation |
| 19 | from typing import List |
| 20 | from typing import NamedTuple |
| 21 | from typing import Optional |
| 22 | |
| 23 | from . import numeric_util |
William Isaksson | 56e5f0c | 2024-01-10 12:28:04 +0100 | [diff] [blame] | 24 | from .api import NpuAccumulatorType |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 25 | from .api import NpuActivationOp |
| 26 | from .api import NpuAddressRange |
| 27 | from .api import NpuBlockOperation |
| 28 | from .api import NpuDmaOperation |
| 29 | from .api import NpuElementWiseOp |
| 30 | from .api import NpuFeatureMap |
| 31 | from .api import NpuKernel |
| 32 | from .api import NpuLayout |
| 33 | from .api import NpuOperation |
| 34 | from .api import NpuOperationType |
| 35 | from .api import NpuPadding |
Fredrik Svedberg | f3c7d55 | 2022-11-04 09:48:49 +0100 | [diff] [blame] | 36 | from .api import NpuQuantization |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 37 | from .api import NpuShape3D |
| 38 | from .architecture_features import ArchitectureFeatures |
| 39 | from .architecture_features import Block |
| 40 | from .architecture_features import Rect |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 41 | from .errors import ByteAlignmentError |
| 42 | from .errors import ByteSizeError |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 43 | from .operation import Kernel |
| 44 | from .operation import PointXYZ |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 45 | from .tensor import TensorFormat |
William Isaksson | 56e5f0c | 2024-01-10 12:28:04 +0100 | [diff] [blame] | 46 | from .tflite.TensorType import TensorType |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 47 | from ethosu.vela.range_set import AccessDirection |
| 48 | from ethosu.vela.range_set import MemoryAccessSet |
| 49 | from ethosu.vela.range_set import MemoryRangeSet |
| 50 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 51 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 52 | # base address slot for memory to memory transfer |
| 53 | BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0)) |
| 54 | |
| 55 | |
Michael McGeagh | f3e3ad7 | 2020-12-02 12:39:03 +0000 | [diff] [blame] | 56 | UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 57 | |
| 58 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 59 | def check_alignment(payload, required_alignment): |
| 60 | # assuming payload is defined in bytes |
| 61 | if payload % required_alignment != 0: |
| 62 | raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned") |
| 63 | |
| 64 | |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 65 | def check_size(payload, required_multiple, value_type): |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 66 | # assuming payload is defined in bytes |
| 67 | if payload % required_multiple != 0: |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 68 | raise ByteSizeError(f"Cmd1 {value_type} of size: {payload} Bytes is not a multiple of {required_multiple}") |
| 69 | |
| 70 | |
| 71 | def check_stride(stride, required_multiple): |
| 72 | check_size(stride, required_multiple, "stride") |
| 73 | |
| 74 | |
| 75 | def check_length(length, required_multiple): |
| 76 | check_size(length, required_multiple, "length") |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 77 | |
| 78 | |
William Isaksson | 56e5f0c | 2024-01-10 12:28:04 +0100 | [diff] [blame] | 79 | def to_npu_acc_type(accType: TensorType) -> NpuAccumulatorType: |
| 80 | if accType == TensorType.INT32: |
| 81 | return NpuAccumulatorType.Int32 |
| 82 | elif accType == TensorType.INT64: |
| 83 | return NpuAccumulatorType.Int40 |
| 84 | else: |
| 85 | return NpuAccumulatorType.Default |
| 86 | |
| 87 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 88 | def to_npu_kernel(kernel: Kernel) -> NpuKernel: |
| 89 | """Converts the given internally used kernel object to NpuKernel (of public API)""" |
| 90 | return NpuKernel( |
| 91 | kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y |
| 92 | ) |
| 93 | |
| 94 | |
| 95 | def to_kernel(kernel: Optional[NpuKernel]) -> Kernel: |
| 96 | """Converts the given public API object to Kernel (used internally)""" |
| 97 | if kernel is None: |
| 98 | return Kernel(1, 1) |
| 99 | return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y) |
| 100 | |
| 101 | |
| 102 | def has_ifm2(npu_op: NpuBlockOperation) -> bool: |
| 103 | """Checks if op has non-scalar IFM2""" |
| 104 | return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None |
| 105 | |
| 106 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 107 | def shape3d_size(shape: NpuShape3D) -> int: |
| 108 | return shape.width * shape.height * shape.depth |
| 109 | |
| 110 | |
| 111 | def shape3d_to_rect(shape: NpuShape3D) -> Rect: |
| 112 | return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1) |
| 113 | |
| 114 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 115 | def shape3d_to_block(shape: NpuShape3D) -> Block: |
| 116 | return Block(shape.width, shape.height, shape.depth) |
| 117 | |
| 118 | |
Fredrik Svedberg | f3c7d55 | 2022-11-04 09:48:49 +0100 | [diff] [blame] | 119 | def get_zero_point(fm: NpuFeatureMap): |
| 120 | return int(fm.quantization.zero_point if fm.quantization else 0) |
| 121 | |
| 122 | |
| 123 | def quantise(value: float, quant: Optional[NpuQuantization]) -> int: |
| 124 | """Quantizes the given value""" |
| 125 | scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32 |
| 126 | zp = 0 if quant is None else quant.zero_point |
| 127 | return numeric_util.quantise_float32(value, scale, zp) |
| 128 | |
| 129 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 130 | # ------------------------------------------------------------------- |
| 131 | # ADDRESSING/STRIDES (helper functions) |
| 132 | # ------------------------------------------------------------------- |
| 133 | |
| 134 | |
| 135 | def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: |
| 136 | """Checks if the ranges overlap""" |
| 137 | return range1.region == range2.region and numeric_util.overlaps( |
| 138 | range1.address, range1.address + range1.length, range2.address, range2.address + range2.length |
| 139 | ) |
| 140 | |
| 141 | |
| 142 | def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool: |
| 143 | """Checks if there is any address overlap between list1 and list2""" |
| 144 | for range1 in list1: |
| 145 | if range1 is None: |
| 146 | continue |
| 147 | for range2 in list2: |
| 148 | if range2 is not None and ranges_overlap(range1, range2): |
| 149 | return True |
| 150 | return False |
| 151 | |
| 152 | |
| 153 | def get_strides(fm: NpuFeatureMap) -> NpuShape3D: |
| 154 | """Calculates STRIDE_C/Y/X""" |
| 155 | if fm.strides is not None: |
| 156 | return fm.strides |
| 157 | elem_size = fm.data_type.size_in_bytes() |
| 158 | if fm.layout == NpuLayout.NHWC: |
| 159 | stride_c = elem_size |
| 160 | stride_x = fm.shape.depth * stride_c |
| 161 | stride_y = fm.shape.width * stride_x |
| 162 | else: |
| 163 | stride_x = 16 * elem_size |
| 164 | stride_c = stride_x * fm.shape.width |
| 165 | stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16) |
| 166 | return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x) |
| 167 | |
| 168 | |
| 169 | def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int: |
| 170 | """Returns address of given coordinate""" |
| 171 | t = 0 |
| 172 | BRICK = 16 |
| 173 | stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth |
| 174 | stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width |
| 175 | if x >= fm.tiles.width_0: |
| 176 | x -= fm.tiles.width_0 |
| 177 | t = 1 |
| 178 | if y >= fm.tiles.height_1: |
| 179 | y -= fm.tiles.height_1 |
| 180 | t += 2 |
| 181 | elif y >= fm.tiles.height_0: |
| 182 | y -= fm.tiles.height_0 |
| 183 | t += 2 |
| 184 | elem_size = fm.data_type.size_in_bytes() |
| 185 | return ( |
| 186 | fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size |
| 187 | ) |
| 188 | |
| 189 | |
| 190 | def get_address_range( |
| 191 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 192 | ) -> NpuAddressRange: |
| 193 | """ |
| 194 | Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm). |
| 195 | The begin and end coordinates must be within the same tile. |
| 196 | """ |
| 197 | addr0 = get_address(fm, strides, y0, x0, c0) |
| 198 | addr1 = get_address(fm, strides, y1, x1, c1) |
| 199 | return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) |
| 200 | |
| 201 | |
| 202 | def get_h_ranges( |
| 203 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 204 | ) -> List[NpuAddressRange]: |
| 205 | """ |
| 206 | Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm); |
| 207 | the begin and end coordinates must be within the same tile. |
| 208 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 209 | """ |
| 210 | return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)] |
| 211 | |
| 212 | |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 213 | def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]: |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 214 | """ |
| 215 | Returns a list of adddress ranges that covers the area start - end (inclusive). |
| 216 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 217 | |
| 218 | For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return |
| 219 | 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF] |
| 220 | |
| 221 | .....|.... .....|.... |
| 222 | t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1 |
| 223 | ..XXX|XX.. ..BBB|DD.. |
| 224 | -----+---- --> -----+---- |
| 225 | t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3 |
| 226 | .....|.... .....|.... |
| 227 | """ |
| 228 | strides = get_strides(fm) |
| 229 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
| 230 | h, w, c = fm.shape |
| 231 | y0, x0, c0 = start.y, start.x, start.z |
| 232 | y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1) |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 233 | ranges: List[Optional[NpuAddressRange]] = [] |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 234 | if x0 < width_0 and y0 < height_0: |
| 235 | # Horizontal ranges for tile 0 |
| 236 | ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1)) |
| 237 | if x1 >= width_0 and y0 < height_1: |
| 238 | # Horizontal ranges for tile 1 |
| 239 | ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1)) |
| 240 | if x0 < width_0 and y1 >= height_0: |
| 241 | # Horizontal ranges for tile 2 |
| 242 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1)) |
| 243 | if x1 >= width_0 and y1 >= height_1: |
| 244 | # Horizontal ranges for tile 3 |
| 245 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1)) |
| 246 | return ranges |
| 247 | |
| 248 | |
| 249 | def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: |
| 250 | """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" |
| 251 | strides = get_strides(fm) |
| 252 | height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth |
| 253 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 254 | t0 = get_address_range( |
| 255 | fm, |
| 256 | strides, |
| 257 | 0, |
| 258 | 0, |
| 259 | 0, |
| 260 | min(height, height_0) - 1, |
| 261 | min(width, width_0) - 1, |
| 262 | depth - 1, |
| 263 | ) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 264 | if width > width_0: |
| 265 | t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) |
| 266 | else: |
| 267 | t1 = None |
| 268 | if height > height_0: |
| 269 | t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1) |
| 270 | else: |
| 271 | t2 = None |
| 272 | if t1 is not None and t2 is not None: |
| 273 | t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1) |
| 274 | else: |
| 275 | t3 = None |
| 276 | return [t0, t1, t2, t3] |
| 277 | |
| 278 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 279 | def check_strides(fm: NpuFeatureMap, strides: NpuShape3D): |
| 280 | |
| 281 | element_size_in_bytes = fm.data_type.size_in_bytes() |
| 282 | |
| 283 | if fm.layout == NpuLayout.NHCWB16: |
| 284 | strides_to_check = [strides.depth, strides.height] |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 285 | required_multiple = 16 |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 286 | else: |
| 287 | strides_to_check = [strides.height, strides.width] |
| 288 | required_multiple = element_size_in_bytes |
| 289 | for stride in strides_to_check: |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 290 | check_stride(stride, required_multiple) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 291 | |
| 292 | |
| 293 | def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures): |
| 294 | if layout == NpuLayout.NHCWB16: |
| 295 | required_alignment = arch.storage_rounding_quantums[TensorFormat.NHCWB16][-1] |
| 296 | else: |
| 297 | required_alignment = element_size |
| 298 | for addr in addresses: |
| 299 | check_alignment(addr, required_alignment) |
| 300 | |
| 301 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 302 | # ------------------------------------------------------------------- |
| 303 | # DMA_WAIT/KERNEL_WAIT |
| 304 | # ------------------------------------------------------------------- |
| 305 | |
| 306 | |
| 307 | class Watermark(NamedTuple): |
| 308 | npu: int |
| 309 | dma: int |
| 310 | |
| 311 | |
| 312 | def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet: |
| 313 | return MemoryRangeSet(range.region, range.address, range.address + range.length) |
| 314 | |
| 315 | |
| 316 | def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet: |
| 317 | """Returns the address that are read and written by the given DMA operation""" |
| 318 | res = MemoryAccessSet() |
| 319 | res.add(memory_range_set(dma_op.src), AccessDirection.Read) |
| 320 | res.add(memory_range_set(dma_op.dest), AccessDirection.Write) |
| 321 | return res |
| 322 | |
| 323 | |
| 324 | def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet: |
| 325 | """Returns the addresses that are read and written by the given operation""" |
| 326 | assert npu_op.ifm is not None and npu_op.ofm is not None |
| 327 | # Read addresses |
| 328 | read_ranges = get_address_ranges(npu_op.ifm) |
| 329 | if has_ifm2(npu_op): |
| 330 | assert npu_op.ifm2 is not None |
| 331 | read_ranges.extend(get_address_ranges(npu_op.ifm2)) |
| 332 | read_ranges.extend(npu_op.weights) |
| 333 | read_ranges.extend(npu_op.biases) |
| 334 | if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP: |
| 335 | address = arch.available_shram_banks(True) * arch.shram_bank_size |
| 336 | read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048)) |
| 337 | # Written addresses |
| 338 | write_ranges = get_address_ranges(npu_op.ofm) |
| 339 | # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks |
| 340 | uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 341 | written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size |
| 342 | write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size)) |
| 343 | |
| 344 | res = MemoryAccessSet() |
| 345 | for read_range in read_ranges: |
| 346 | if read_range is not None: |
| 347 | res.add(memory_range_set(read_range), AccessDirection.Read) |
| 348 | for write_range in write_ranges: |
| 349 | if write_range is not None: |
| 350 | res.add(memory_range_set(write_range), AccessDirection.Write) |
| 351 | return res |
| 352 | |
| 353 | |
| 354 | def get_wait_dependency( |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 355 | arch: ArchitectureFeatures, |
| 356 | npu_op: NpuOperation, |
| 357 | memory_accesses, |
| 358 | outstanding_dma_ops: List[NpuOperation], |
| 359 | outstanding_npu_ops: List[NpuOperation], |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 360 | ): |
| 361 | """Used to calculate whether DMA wait or kernel wait operations are needed""" |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 362 | kern_wait = -1 |
| 363 | dma_wait = -1 |
| 364 | op_accesses = memory_accesses[npu_op] |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 365 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 366 | if isinstance(npu_op, NpuDmaOperation): |
| 367 | outstanding_ops = outstanding_npu_ops |
| 368 | outstanding_dma_ops.append(npu_op) |
| 369 | if len(outstanding_dma_ops) > arch.max_outstanding_dma: |
| 370 | outstanding_dma_ops.pop(0) |
| 371 | else: |
| 372 | outstanding_ops = outstanding_dma_ops |
| 373 | outstanding_npu_ops.append(npu_op) |
| 374 | if len(outstanding_npu_ops) > arch.max_outstanding_kernels: |
| 375 | outstanding_npu_ops.pop(0) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 376 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 377 | waits = -1 |
| 378 | for idx in range(len(outstanding_ops) - 1, -1, -1): |
| 379 | waits += 1 |
| 380 | other_op = outstanding_ops[idx] |
| 381 | other_accesses = memory_accesses[other_op] |
| 382 | if other_accesses.conflicts(op_accesses): |
| 383 | if isinstance(npu_op, NpuDmaOperation): |
| 384 | kern_wait = waits |
| 385 | else: |
| 386 | dma_wait = waits |
| 387 | # Current op needs to wait, and after it has waited, |
| 388 | # outstanding_ops[0..idx] are not outstanding any longer |
| 389 | for i in range(idx + 1): |
| 390 | outstanding_ops.pop(0) |
| 391 | break |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 392 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 393 | cmd_waits = Watermark(kern_wait, dma_wait) |
| 394 | return cmd_waits |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 395 | |
| 396 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 397 | def check_dma_op(dma_op: NpuDmaOperation, arch: ArchitectureFeatures): |
| 398 | |
| 399 | # For Ethos-U65 only internal addresses have to be aligned, and if the internal address is the destination |
| 400 | # then the length has to be aligned also. |
| 401 | if arch.is_ethos_u65_system: |
| 402 | if dma_op.src.region == BASE_PTR_INDEX_MEM2MEM: |
| 403 | check_alignment(dma_op.src.address, 16) |
| 404 | if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM: |
| 405 | check_alignment(dma_op.dest.address, 16) |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 406 | check_length(dma_op.src.length, 16) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 407 | else: |
| 408 | check_alignment(dma_op.src.address, 16) |
| 409 | check_alignment(dma_op.dest.address, 16) |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame] | 410 | check_length(dma_op.src.length, 16) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 411 | |
| 412 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 413 | # ------------------------------------------------------------------- |
| 414 | # BLOCKDEP |
| 415 | # ------------------------------------------------------------------- |
| 416 | |
| 417 | |
| 418 | def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int: |
| 419 | # Note: NOT equivalent to the normal ifm block depth calculation since |
| 420 | # it takes into account 'depthless' block operations by returning full |
| 421 | # depth |
| 422 | if npu_op.op_type == NpuOperationType.Conv2D: |
| 423 | res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits()) |
| 424 | return res |
| 425 | return npu_op.ofm.shape.depth |
| 426 | |
| 427 | |
| 428 | def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool: |
| 429 | """Checks if the two areas overlap""" |
| 430 | start_x = max(start_a.x, start_b.x) |
| 431 | end_x = min(end_a.x, end_b.x) |
| 432 | start_y = max(start_a.y, start_b.y) |
| 433 | end_y = min(end_a.y, end_b.y) |
| 434 | start_z = max(start_a.z, start_b.z) |
| 435 | end_z = min(end_a.z, end_b.z) |
| 436 | return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0) |
| 437 | |
| 438 | |
| 439 | def intersects( |
| 440 | ifm: NpuFeatureMap, |
| 441 | ifm_start_coord: PointXYZ, |
| 442 | ifm_end_coord: PointXYZ, |
| 443 | prev_ofm: NpuFeatureMap, |
| 444 | ofm_start_coord: PointXYZ, |
| 445 | ofm_end_coord: PointXYZ, |
| 446 | ) -> bool: |
| 447 | """Checks if the given IFM area overlaps with the given OFM area""" |
| 448 | if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles: |
| 449 | # Common case: prev_op.ofm == op.ifm; in this case it suffices to check |
| 450 | # if the xyz coordinates overlap, which is quick and easy |
| 451 | res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord) |
| 452 | else: |
| 453 | # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM. |
| 454 | # In this case, address comparison between the two areas is needed |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 455 | ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 456 | prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord) |
| 457 | res = range_lists_overlap(ifm_ranges, prev_ofm_ranges) |
| 458 | return res |
| 459 | |
| 460 | |
| 461 | # Block job dependency: |
| 462 | # Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10) |
| 463 | # |
| 464 | # A | B |
| 465 | # ----------------------+------------------ |
| 466 | # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER |
| 467 | # |<------->| dependency offset |
| 468 | # |
| 469 | |
| 470 | |
| 471 | def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]: |
| 472 | """ |
| 473 | Get the coordinates of a block offset from either the end (negative) |
| 474 | or the start (zero or positive) of the given 3D area |
| 475 | """ |
| 476 | size = area.size() |
| 477 | # Dimensions of the region, in blocks |
| 478 | width_blocks = numeric_util.round_up_divide(size.width, block.width) |
| 479 | height_blocks = numeric_util.round_up_divide(size.height, block.height) |
| 480 | depth_blocks = numeric_util.round_up_divide(size.depth, block.depth) |
| 481 | total_blocks = width_blocks * height_blocks * depth_blocks |
| 482 | if offset < 0: |
| 483 | index = total_blocks + offset |
| 484 | else: |
| 485 | index = offset |
| 486 | |
| 487 | if index >= total_blocks: |
| 488 | return None |
| 489 | |
| 490 | # Coordinates of the indexed block |
| 491 | coord_z = block.depth * (index % depth_blocks) |
| 492 | coord_y = block.height * (index // (depth_blocks * width_blocks)) |
| 493 | coord_x = block.width * ((index // depth_blocks) % width_blocks) |
| 494 | |
| 495 | return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z) |
| 496 | |
| 497 | |
| 498 | def get_first_job_input_volume( |
| 499 | arch: ArchitectureFeatures, |
| 500 | ifm: Rect, |
| 501 | ofm: Rect, |
| 502 | ifm_block_depth, |
| 503 | ofm_block: Block, |
| 504 | kernel: Kernel, |
| 505 | padding: NpuPadding, |
| 506 | block_offset: int, |
| 507 | ): |
| 508 | # Get ifm block size (jobs are invisibly decomposed into subkernels) |
| 509 | ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max) |
| 510 | ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth) |
| 511 | |
| 512 | # Which OFM block are we calculating |
| 513 | ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks) |
| 514 | if ofm_coord is None: |
| 515 | return None |
| 516 | |
| 517 | # Coordinate of the source IFM block |
| 518 | ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left) |
| 519 | ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right) |
| 520 | ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth |
| 521 | |
| 522 | # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM |
| 523 | start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z) |
| 524 | end_coord = PointXYZ( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 525 | x=start_coord[0] + ifm_block.width, |
| 526 | y=start_coord[1] + ifm_block.height, |
| 527 | z=start_coord[2] + ifm_block.depth, |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 528 | ) |
| 529 | return (start_coord, end_coord, 1) # start, end, total jobs |
| 530 | |
| 531 | |
| 532 | def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int): |
| 533 | assert block_offset >= 0 |
| 534 | |
| 535 | # Get OFM block's volume coordinates |
| 536 | start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset) |
| 537 | if start_coord is None: |
| 538 | return None |
| 539 | end_coord = PointXYZ( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 540 | x=start_coord.x + ofm_block.width, |
| 541 | y=start_coord.y + ofm_block.height, |
| 542 | z=start_coord.z + ofm_block.depth, |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 543 | ) |
| 544 | return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block |
| 545 | |
| 546 | |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 547 | def calc_blockdep( |
| 548 | arch: ArchitectureFeatures, |
| 549 | prev_op: Optional[NpuBlockOperation], |
| 550 | npu_op: NpuBlockOperation, |
| 551 | ) -> int: |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 552 | """Calculates the value of the BLOCKDEP register""" |
| 553 | if prev_op is None: |
| 554 | return 0 |
| 555 | assert npu_op.ifm is not None |
| 556 | assert prev_op.ofm is not None |
Diqing Zhong | 455e20e | 2021-02-03 16:37:31 +0100 | [diff] [blame] | 557 | # Check if the reserved shram will be used in current/prev op |
| 558 | prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 559 | curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 560 | if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut: |
| 561 | return 0 |
| 562 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 563 | # Check if IFM or IFM2 overlaps with prev op's OFM |
| 564 | prev_ofm_ranges = get_address_ranges(prev_op.ofm) |
| 565 | ifm_ranges = get_address_ranges(npu_op.ifm) |
| 566 | ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges) |
| 567 | if has_ifm2(npu_op): |
| 568 | assert npu_op.ifm2 is not None |
| 569 | ifm2_ranges = get_address_ranges(npu_op.ifm2) |
| 570 | ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges) |
| 571 | else: |
| 572 | ifm2_overlaps = False |
| 573 | if ifm_overlaps and ifm2_overlaps: |
| 574 | # Both IFM and IFM2 overlap (should be rare) |
| 575 | return 0 |
| 576 | if not ifm_overlaps and not ifm2_overlaps: |
| 577 | # No overlap between prev OFM and IFM/IFM2 |
| 578 | return ArchitectureFeatures.MAX_BLOCKDEP |
| 579 | if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape): |
| 580 | # Prev OFM produces IFM2 which is broadcasted (this should be rare) |
| 581 | return 0 |
| 582 | # Prev OFM overlaps with IFM or IFM2; calculate the blockdep |
| 583 | prev_block_config = prev_op.block_config |
| 584 | block_config = npu_op.block_config |
| 585 | overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2 |
| 586 | assert overlapping_fm is not None |
| 587 | |
| 588 | cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) |
| 589 | cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) |
| 590 | cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) |
| 591 | cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) |
| 592 | padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding |
| 593 | blockdep = ArchitectureFeatures.MAX_BLOCKDEP |
| 594 | kernel = to_kernel(npu_op.kernel) |
| 595 | |
| 596 | prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) |
| 597 | prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) |
| 598 | # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window |
| 599 | # of IFM area overlaps with any previous OFM block generation. |
| 600 | elapsed_jobs = 0 |
| 601 | for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 602 | # This is the IFM block we want to sample from |
| 603 | in_area = get_first_job_input_volume( |
| 604 | arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset |
| 605 | ) |
| 606 | if in_area is None: |
| 607 | break |
| 608 | |
| 609 | # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs) |
| 610 | outstanding_jobs = 0 |
| 611 | for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 612 | # This is the OFM block being generated by the previous op |
| 613 | out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset) |
| 614 | if out_area is None: |
| 615 | break |
| 616 | |
| 617 | # Block dependency is the max number of allowed outstanding jobs |
| 618 | # in the pipeline. Selected by determining how many jobs occur |
| 619 | # in between two operators' overlapping OFM->IFM block volumes |
| 620 | if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]): |
| 621 | break |
| 622 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 623 | elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 624 | break |
| 625 | |
| 626 | # This OFM had this many jobs (accumulate over multiple OFM blocks) |
| 627 | outstanding_jobs += out_area[2] |
| 628 | |
| 629 | blockdep = min(blockdep, elapsed_jobs + outstanding_jobs) |
| 630 | elapsed_jobs += in_area[2] |
| 631 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 632 | if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 633 | break |
| 634 | |
| 635 | return blockdep |