Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Utility functions for code generation |
| 19 | from typing import List |
| 20 | from typing import NamedTuple |
| 21 | from typing import Optional |
| 22 | |
| 23 | from . import numeric_util |
| 24 | from .api import NpuActivationOp |
| 25 | from .api import NpuAddressRange |
| 26 | from .api import NpuBlockOperation |
| 27 | from .api import NpuDmaOperation |
| 28 | from .api import NpuElementWiseOp |
| 29 | from .api import NpuFeatureMap |
| 30 | from .api import NpuKernel |
| 31 | from .api import NpuLayout |
| 32 | from .api import NpuOperation |
| 33 | from .api import NpuOperationType |
| 34 | from .api import NpuPadding |
Fredrik Svedberg | f3c7d55 | 2022-11-04 09:48:49 +0100 | [diff] [blame] | 35 | from .api import NpuQuantization |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 36 | from .api import NpuShape3D |
| 37 | from .architecture_features import ArchitectureFeatures |
| 38 | from .architecture_features import Block |
| 39 | from .architecture_features import Rect |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 40 | from .errors import ByteAlignmentError |
| 41 | from .errors import ByteSizeError |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 42 | from .operation import Kernel |
| 43 | from .operation import PointXYZ |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 44 | from .tensor import TensorFormat |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 45 | from ethosu.vela.range_set import AccessDirection |
| 46 | from ethosu.vela.range_set import MemoryAccessSet |
| 47 | from ethosu.vela.range_set import MemoryRangeSet |
| 48 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 49 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 50 | # base address slot for memory to memory transfer |
| 51 | BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0)) |
| 52 | |
| 53 | |
Michael McGeagh | f3e3ad7 | 2020-12-02 12:39:03 +0000 | [diff] [blame] | 54 | UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 55 | |
| 56 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 57 | def check_alignment(payload, required_alignment): |
| 58 | # assuming payload is defined in bytes |
| 59 | if payload % required_alignment != 0: |
| 60 | raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned") |
| 61 | |
| 62 | |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 63 | def check_size(payload, required_multiple, value_type): |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 64 | # assuming payload is defined in bytes |
| 65 | if payload % required_multiple != 0: |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 66 | raise ByteSizeError(f"Cmd1 {value_type} of size: {payload} Bytes is not a multiple of {required_multiple}") |
| 67 | |
| 68 | |
| 69 | def check_stride(stride, required_multiple): |
| 70 | check_size(stride, required_multiple, "stride") |
| 71 | |
| 72 | |
| 73 | def check_length(length, required_multiple): |
| 74 | check_size(length, required_multiple, "length") |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 75 | |
| 76 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 77 | def to_npu_kernel(kernel: Kernel) -> NpuKernel: |
| 78 | """Converts the given internally used kernel object to NpuKernel (of public API)""" |
| 79 | return NpuKernel( |
| 80 | kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y |
| 81 | ) |
| 82 | |
| 83 | |
| 84 | def to_kernel(kernel: Optional[NpuKernel]) -> Kernel: |
| 85 | """Converts the given public API object to Kernel (used internally)""" |
| 86 | if kernel is None: |
| 87 | return Kernel(1, 1) |
| 88 | return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y) |
| 89 | |
| 90 | |
| 91 | def has_ifm2(npu_op: NpuBlockOperation) -> bool: |
| 92 | """Checks if op has non-scalar IFM2""" |
| 93 | return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None |
| 94 | |
| 95 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 96 | def shape3d_size(shape: NpuShape3D) -> int: |
| 97 | return shape.width * shape.height * shape.depth |
| 98 | |
| 99 | |
| 100 | def shape3d_to_rect(shape: NpuShape3D) -> Rect: |
| 101 | return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1) |
| 102 | |
| 103 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 104 | def shape3d_to_block(shape: NpuShape3D) -> Block: |
| 105 | return Block(shape.width, shape.height, shape.depth) |
| 106 | |
| 107 | |
Fredrik Svedberg | f3c7d55 | 2022-11-04 09:48:49 +0100 | [diff] [blame] | 108 | def get_zero_point(fm: NpuFeatureMap): |
| 109 | return int(fm.quantization.zero_point if fm.quantization else 0) |
| 110 | |
| 111 | |
| 112 | def quantise(value: float, quant: Optional[NpuQuantization]) -> int: |
| 113 | """Quantizes the given value""" |
| 114 | scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32 |
| 115 | zp = 0 if quant is None else quant.zero_point |
| 116 | return numeric_util.quantise_float32(value, scale, zp) |
| 117 | |
| 118 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 119 | # ------------------------------------------------------------------- |
| 120 | # ADDRESSING/STRIDES (helper functions) |
| 121 | # ------------------------------------------------------------------- |
| 122 | |
| 123 | |
| 124 | def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: |
| 125 | """Checks if the ranges overlap""" |
| 126 | return range1.region == range2.region and numeric_util.overlaps( |
| 127 | range1.address, range1.address + range1.length, range2.address, range2.address + range2.length |
| 128 | ) |
| 129 | |
| 130 | |
| 131 | def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool: |
| 132 | """Checks if there is any address overlap between list1 and list2""" |
| 133 | for range1 in list1: |
| 134 | if range1 is None: |
| 135 | continue |
| 136 | for range2 in list2: |
| 137 | if range2 is not None and ranges_overlap(range1, range2): |
| 138 | return True |
| 139 | return False |
| 140 | |
| 141 | |
| 142 | def get_strides(fm: NpuFeatureMap) -> NpuShape3D: |
| 143 | """Calculates STRIDE_C/Y/X""" |
| 144 | if fm.strides is not None: |
| 145 | return fm.strides |
| 146 | elem_size = fm.data_type.size_in_bytes() |
| 147 | if fm.layout == NpuLayout.NHWC: |
| 148 | stride_c = elem_size |
| 149 | stride_x = fm.shape.depth * stride_c |
| 150 | stride_y = fm.shape.width * stride_x |
| 151 | else: |
| 152 | stride_x = 16 * elem_size |
| 153 | stride_c = stride_x * fm.shape.width |
| 154 | stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16) |
| 155 | return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x) |
| 156 | |
| 157 | |
| 158 | def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int: |
| 159 | """Returns address of given coordinate""" |
| 160 | t = 0 |
| 161 | BRICK = 16 |
| 162 | stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth |
| 163 | stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width |
| 164 | if x >= fm.tiles.width_0: |
| 165 | x -= fm.tiles.width_0 |
| 166 | t = 1 |
| 167 | if y >= fm.tiles.height_1: |
| 168 | y -= fm.tiles.height_1 |
| 169 | t += 2 |
| 170 | elif y >= fm.tiles.height_0: |
| 171 | y -= fm.tiles.height_0 |
| 172 | t += 2 |
| 173 | elem_size = fm.data_type.size_in_bytes() |
| 174 | return ( |
| 175 | fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size |
| 176 | ) |
| 177 | |
| 178 | |
| 179 | def get_address_range( |
| 180 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 181 | ) -> NpuAddressRange: |
| 182 | """ |
| 183 | Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm). |
| 184 | The begin and end coordinates must be within the same tile. |
| 185 | """ |
| 186 | addr0 = get_address(fm, strides, y0, x0, c0) |
| 187 | addr1 = get_address(fm, strides, y1, x1, c1) |
| 188 | return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) |
| 189 | |
| 190 | |
| 191 | def get_h_ranges( |
| 192 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 193 | ) -> List[NpuAddressRange]: |
| 194 | """ |
| 195 | Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm); |
| 196 | the begin and end coordinates must be within the same tile. |
| 197 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 198 | """ |
| 199 | return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)] |
| 200 | |
| 201 | |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 202 | def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]: |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 203 | """ |
| 204 | Returns a list of adddress ranges that covers the area start - end (inclusive). |
| 205 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 206 | |
| 207 | For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return |
| 208 | 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF] |
| 209 | |
| 210 | .....|.... .....|.... |
| 211 | t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1 |
| 212 | ..XXX|XX.. ..BBB|DD.. |
| 213 | -----+---- --> -----+---- |
| 214 | t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3 |
| 215 | .....|.... .....|.... |
| 216 | """ |
| 217 | strides = get_strides(fm) |
| 218 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
| 219 | h, w, c = fm.shape |
| 220 | y0, x0, c0 = start.y, start.x, start.z |
| 221 | y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1) |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 222 | ranges: List[Optional[NpuAddressRange]] = [] |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 223 | if x0 < width_0 and y0 < height_0: |
| 224 | # Horizontal ranges for tile 0 |
| 225 | ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1)) |
| 226 | if x1 >= width_0 and y0 < height_1: |
| 227 | # Horizontal ranges for tile 1 |
| 228 | ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1)) |
| 229 | if x0 < width_0 and y1 >= height_0: |
| 230 | # Horizontal ranges for tile 2 |
| 231 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1)) |
| 232 | if x1 >= width_0 and y1 >= height_1: |
| 233 | # Horizontal ranges for tile 3 |
| 234 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1)) |
| 235 | return ranges |
| 236 | |
| 237 | |
| 238 | def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: |
| 239 | """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" |
| 240 | strides = get_strides(fm) |
| 241 | height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth |
| 242 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 243 | t0 = get_address_range( |
| 244 | fm, |
| 245 | strides, |
| 246 | 0, |
| 247 | 0, |
| 248 | 0, |
| 249 | min(height, height_0) - 1, |
| 250 | min(width, width_0) - 1, |
| 251 | depth - 1, |
| 252 | ) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 253 | if width > width_0: |
| 254 | t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) |
| 255 | else: |
| 256 | t1 = None |
| 257 | if height > height_0: |
| 258 | t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1) |
| 259 | else: |
| 260 | t2 = None |
| 261 | if t1 is not None and t2 is not None: |
| 262 | t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1) |
| 263 | else: |
| 264 | t3 = None |
| 265 | return [t0, t1, t2, t3] |
| 266 | |
| 267 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 268 | def check_strides(fm: NpuFeatureMap, strides: NpuShape3D): |
| 269 | |
| 270 | element_size_in_bytes = fm.data_type.size_in_bytes() |
| 271 | |
| 272 | if fm.layout == NpuLayout.NHCWB16: |
| 273 | strides_to_check = [strides.depth, strides.height] |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 274 | required_multiple = 16 |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 275 | else: |
| 276 | strides_to_check = [strides.height, strides.width] |
| 277 | required_multiple = element_size_in_bytes |
| 278 | for stride in strides_to_check: |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 279 | check_stride(stride, required_multiple) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 280 | |
| 281 | |
| 282 | def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures): |
| 283 | if layout == NpuLayout.NHCWB16: |
| 284 | required_alignment = arch.storage_rounding_quantums[TensorFormat.NHCWB16][-1] |
| 285 | else: |
| 286 | required_alignment = element_size |
| 287 | for addr in addresses: |
| 288 | check_alignment(addr, required_alignment) |
| 289 | |
| 290 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 291 | # ------------------------------------------------------------------- |
| 292 | # DMA_WAIT/KERNEL_WAIT |
| 293 | # ------------------------------------------------------------------- |
| 294 | |
| 295 | |
| 296 | class Watermark(NamedTuple): |
| 297 | npu: int |
| 298 | dma: int |
| 299 | |
| 300 | |
| 301 | def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet: |
| 302 | return MemoryRangeSet(range.region, range.address, range.address + range.length) |
| 303 | |
| 304 | |
| 305 | def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet: |
| 306 | """Returns the address that are read and written by the given DMA operation""" |
| 307 | res = MemoryAccessSet() |
| 308 | res.add(memory_range_set(dma_op.src), AccessDirection.Read) |
| 309 | res.add(memory_range_set(dma_op.dest), AccessDirection.Write) |
| 310 | return res |
| 311 | |
| 312 | |
| 313 | def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet: |
| 314 | """Returns the addresses that are read and written by the given operation""" |
| 315 | assert npu_op.ifm is not None and npu_op.ofm is not None |
| 316 | # Read addresses |
| 317 | read_ranges = get_address_ranges(npu_op.ifm) |
| 318 | if has_ifm2(npu_op): |
| 319 | assert npu_op.ifm2 is not None |
| 320 | read_ranges.extend(get_address_ranges(npu_op.ifm2)) |
| 321 | read_ranges.extend(npu_op.weights) |
| 322 | read_ranges.extend(npu_op.biases) |
| 323 | if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP: |
| 324 | address = arch.available_shram_banks(True) * arch.shram_bank_size |
| 325 | read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048)) |
| 326 | # Written addresses |
| 327 | write_ranges = get_address_ranges(npu_op.ofm) |
| 328 | # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks |
| 329 | uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 330 | written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size |
| 331 | write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size)) |
| 332 | |
| 333 | res = MemoryAccessSet() |
| 334 | for read_range in read_ranges: |
| 335 | if read_range is not None: |
| 336 | res.add(memory_range_set(read_range), AccessDirection.Read) |
| 337 | for write_range in write_ranges: |
| 338 | if write_range is not None: |
| 339 | res.add(memory_range_set(write_range), AccessDirection.Write) |
| 340 | return res |
| 341 | |
| 342 | |
| 343 | def get_wait_dependency( |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 344 | arch: ArchitectureFeatures, |
| 345 | npu_op: NpuOperation, |
| 346 | memory_accesses, |
| 347 | outstanding_dma_ops: List[NpuOperation], |
| 348 | outstanding_npu_ops: List[NpuOperation], |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 349 | ): |
| 350 | """Used to calculate whether DMA wait or kernel wait operations are needed""" |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 351 | kern_wait = -1 |
| 352 | dma_wait = -1 |
| 353 | op_accesses = memory_accesses[npu_op] |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 354 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 355 | if isinstance(npu_op, NpuDmaOperation): |
| 356 | outstanding_ops = outstanding_npu_ops |
| 357 | outstanding_dma_ops.append(npu_op) |
| 358 | if len(outstanding_dma_ops) > arch.max_outstanding_dma: |
| 359 | outstanding_dma_ops.pop(0) |
| 360 | else: |
| 361 | outstanding_ops = outstanding_dma_ops |
| 362 | outstanding_npu_ops.append(npu_op) |
| 363 | if len(outstanding_npu_ops) > arch.max_outstanding_kernels: |
| 364 | outstanding_npu_ops.pop(0) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 365 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 366 | waits = -1 |
| 367 | for idx in range(len(outstanding_ops) - 1, -1, -1): |
| 368 | waits += 1 |
| 369 | other_op = outstanding_ops[idx] |
| 370 | other_accesses = memory_accesses[other_op] |
| 371 | if other_accesses.conflicts(op_accesses): |
| 372 | if isinstance(npu_op, NpuDmaOperation): |
| 373 | kern_wait = waits |
| 374 | else: |
| 375 | dma_wait = waits |
| 376 | # Current op needs to wait, and after it has waited, |
| 377 | # outstanding_ops[0..idx] are not outstanding any longer |
| 378 | for i in range(idx + 1): |
| 379 | outstanding_ops.pop(0) |
| 380 | break |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 381 | |
Alexander Hansson | ca9cc42 | 2023-06-22 16:01:27 +0000 | [diff] [blame] | 382 | cmd_waits = Watermark(kern_wait, dma_wait) |
| 383 | return cmd_waits |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 384 | |
| 385 | |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 386 | def check_dma_op(dma_op: NpuDmaOperation, arch: ArchitectureFeatures): |
| 387 | |
| 388 | # For Ethos-U65 only internal addresses have to be aligned, and if the internal address is the destination |
| 389 | # then the length has to be aligned also. |
| 390 | if arch.is_ethos_u65_system: |
| 391 | if dma_op.src.region == BASE_PTR_INDEX_MEM2MEM: |
| 392 | check_alignment(dma_op.src.address, 16) |
| 393 | if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM: |
| 394 | check_alignment(dma_op.dest.address, 16) |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 395 | check_length(dma_op.src.length, 16) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 396 | else: |
| 397 | check_alignment(dma_op.src.address, 16) |
| 398 | check_alignment(dma_op.dest.address, 16) |
Björn Davidsson | 199e8e6 | 2023-10-10 11:22:59 +0200 | [diff] [blame^] | 399 | check_length(dma_op.src.length, 16) |
William Isaksson | a4f8411 | 2023-06-19 15:31:46 +0000 | [diff] [blame] | 400 | |
| 401 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 402 | # ------------------------------------------------------------------- |
| 403 | # BLOCKDEP |
| 404 | # ------------------------------------------------------------------- |
| 405 | |
| 406 | |
| 407 | def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int: |
| 408 | # Note: NOT equivalent to the normal ifm block depth calculation since |
| 409 | # it takes into account 'depthless' block operations by returning full |
| 410 | # depth |
| 411 | if npu_op.op_type == NpuOperationType.Conv2D: |
| 412 | res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits()) |
| 413 | return res |
| 414 | return npu_op.ofm.shape.depth |
| 415 | |
| 416 | |
| 417 | def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool: |
| 418 | """Checks if the two areas overlap""" |
| 419 | start_x = max(start_a.x, start_b.x) |
| 420 | end_x = min(end_a.x, end_b.x) |
| 421 | start_y = max(start_a.y, start_b.y) |
| 422 | end_y = min(end_a.y, end_b.y) |
| 423 | start_z = max(start_a.z, start_b.z) |
| 424 | end_z = min(end_a.z, end_b.z) |
| 425 | return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0) |
| 426 | |
| 427 | |
| 428 | def intersects( |
| 429 | ifm: NpuFeatureMap, |
| 430 | ifm_start_coord: PointXYZ, |
| 431 | ifm_end_coord: PointXYZ, |
| 432 | prev_ofm: NpuFeatureMap, |
| 433 | ofm_start_coord: PointXYZ, |
| 434 | ofm_end_coord: PointXYZ, |
| 435 | ) -> bool: |
| 436 | """Checks if the given IFM area overlaps with the given OFM area""" |
| 437 | if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles: |
| 438 | # Common case: prev_op.ofm == op.ifm; in this case it suffices to check |
| 439 | # if the xyz coordinates overlap, which is quick and easy |
| 440 | res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord) |
| 441 | else: |
| 442 | # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM. |
| 443 | # In this case, address comparison between the two areas is needed |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 444 | ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 445 | prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord) |
| 446 | res = range_lists_overlap(ifm_ranges, prev_ofm_ranges) |
| 447 | return res |
| 448 | |
| 449 | |
| 450 | # Block job dependency: |
| 451 | # Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10) |
| 452 | # |
| 453 | # A | B |
| 454 | # ----------------------+------------------ |
| 455 | # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER |
| 456 | # |<------->| dependency offset |
| 457 | # |
| 458 | |
| 459 | |
| 460 | def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]: |
| 461 | """ |
| 462 | Get the coordinates of a block offset from either the end (negative) |
| 463 | or the start (zero or positive) of the given 3D area |
| 464 | """ |
| 465 | size = area.size() |
| 466 | # Dimensions of the region, in blocks |
| 467 | width_blocks = numeric_util.round_up_divide(size.width, block.width) |
| 468 | height_blocks = numeric_util.round_up_divide(size.height, block.height) |
| 469 | depth_blocks = numeric_util.round_up_divide(size.depth, block.depth) |
| 470 | total_blocks = width_blocks * height_blocks * depth_blocks |
| 471 | if offset < 0: |
| 472 | index = total_blocks + offset |
| 473 | else: |
| 474 | index = offset |
| 475 | |
| 476 | if index >= total_blocks: |
| 477 | return None |
| 478 | |
| 479 | # Coordinates of the indexed block |
| 480 | coord_z = block.depth * (index % depth_blocks) |
| 481 | coord_y = block.height * (index // (depth_blocks * width_blocks)) |
| 482 | coord_x = block.width * ((index // depth_blocks) % width_blocks) |
| 483 | |
| 484 | return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z) |
| 485 | |
| 486 | |
| 487 | def get_first_job_input_volume( |
| 488 | arch: ArchitectureFeatures, |
| 489 | ifm: Rect, |
| 490 | ofm: Rect, |
| 491 | ifm_block_depth, |
| 492 | ofm_block: Block, |
| 493 | kernel: Kernel, |
| 494 | padding: NpuPadding, |
| 495 | block_offset: int, |
| 496 | ): |
| 497 | # Get ifm block size (jobs are invisibly decomposed into subkernels) |
| 498 | ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max) |
| 499 | ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth) |
| 500 | |
| 501 | # Which OFM block are we calculating |
| 502 | ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks) |
| 503 | if ofm_coord is None: |
| 504 | return None |
| 505 | |
| 506 | # Coordinate of the source IFM block |
| 507 | ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left) |
| 508 | ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right) |
| 509 | ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth |
| 510 | |
| 511 | # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM |
| 512 | start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z) |
| 513 | end_coord = PointXYZ( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 514 | x=start_coord[0] + ifm_block.width, |
| 515 | y=start_coord[1] + ifm_block.height, |
| 516 | z=start_coord[2] + ifm_block.depth, |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 517 | ) |
| 518 | return (start_coord, end_coord, 1) # start, end, total jobs |
| 519 | |
| 520 | |
| 521 | def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int): |
| 522 | assert block_offset >= 0 |
| 523 | |
| 524 | # Get OFM block's volume coordinates |
| 525 | start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset) |
| 526 | if start_coord is None: |
| 527 | return None |
| 528 | end_coord = PointXYZ( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 529 | x=start_coord.x + ofm_block.width, |
| 530 | y=start_coord.y + ofm_block.height, |
| 531 | z=start_coord.z + ofm_block.depth, |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 532 | ) |
| 533 | return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block |
| 534 | |
| 535 | |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 536 | def calc_blockdep( |
| 537 | arch: ArchitectureFeatures, |
| 538 | prev_op: Optional[NpuBlockOperation], |
| 539 | npu_op: NpuBlockOperation, |
| 540 | ) -> int: |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 541 | """Calculates the value of the BLOCKDEP register""" |
| 542 | if prev_op is None: |
| 543 | return 0 |
| 544 | assert npu_op.ifm is not None |
| 545 | assert prev_op.ofm is not None |
Diqing Zhong | 455e20e | 2021-02-03 16:37:31 +0100 | [diff] [blame] | 546 | # Check if the reserved shram will be used in current/prev op |
| 547 | prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 548 | curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 549 | if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut: |
| 550 | return 0 |
| 551 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 552 | # Check if IFM or IFM2 overlaps with prev op's OFM |
| 553 | prev_ofm_ranges = get_address_ranges(prev_op.ofm) |
| 554 | ifm_ranges = get_address_ranges(npu_op.ifm) |
| 555 | ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges) |
| 556 | if has_ifm2(npu_op): |
| 557 | assert npu_op.ifm2 is not None |
| 558 | ifm2_ranges = get_address_ranges(npu_op.ifm2) |
| 559 | ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges) |
| 560 | else: |
| 561 | ifm2_overlaps = False |
| 562 | if ifm_overlaps and ifm2_overlaps: |
| 563 | # Both IFM and IFM2 overlap (should be rare) |
| 564 | return 0 |
| 565 | if not ifm_overlaps and not ifm2_overlaps: |
| 566 | # No overlap between prev OFM and IFM/IFM2 |
| 567 | return ArchitectureFeatures.MAX_BLOCKDEP |
| 568 | if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape): |
| 569 | # Prev OFM produces IFM2 which is broadcasted (this should be rare) |
| 570 | return 0 |
| 571 | # Prev OFM overlaps with IFM or IFM2; calculate the blockdep |
| 572 | prev_block_config = prev_op.block_config |
| 573 | block_config = npu_op.block_config |
| 574 | overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2 |
| 575 | assert overlapping_fm is not None |
| 576 | |
| 577 | cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) |
| 578 | cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) |
| 579 | cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) |
| 580 | cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) |
| 581 | padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding |
| 582 | blockdep = ArchitectureFeatures.MAX_BLOCKDEP |
| 583 | kernel = to_kernel(npu_op.kernel) |
| 584 | |
| 585 | prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) |
| 586 | prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) |
| 587 | # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window |
| 588 | # of IFM area overlaps with any previous OFM block generation. |
| 589 | elapsed_jobs = 0 |
| 590 | for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 591 | # This is the IFM block we want to sample from |
| 592 | in_area = get_first_job_input_volume( |
| 593 | arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset |
| 594 | ) |
| 595 | if in_area is None: |
| 596 | break |
| 597 | |
| 598 | # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs) |
| 599 | outstanding_jobs = 0 |
| 600 | for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 601 | # This is the OFM block being generated by the previous op |
| 602 | out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset) |
| 603 | if out_area is None: |
| 604 | break |
| 605 | |
| 606 | # Block dependency is the max number of allowed outstanding jobs |
| 607 | # in the pipeline. Selected by determining how many jobs occur |
| 608 | # in between two operators' overlapping OFM->IFM block volumes |
| 609 | if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]): |
| 610 | break |
| 611 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 612 | elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 613 | break |
| 614 | |
| 615 | # This OFM had this many jobs (accumulate over multiple OFM blocks) |
| 616 | outstanding_jobs += out_area[2] |
| 617 | |
| 618 | blockdep = min(blockdep, elapsed_jobs + outstanding_jobs) |
| 619 | elapsed_jobs += in_area[2] |
| 620 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 621 | if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 622 | break |
| 623 | |
| 624 | return blockdep |