erik.andersson@arm.com | 460c689 | 2021-02-24 14:38:09 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | # |
| 17 | # Description: |
| 18 | # Utility functions for code generation |
| 19 | from typing import List |
| 20 | from typing import NamedTuple |
| 21 | from typing import Optional |
| 22 | |
| 23 | from . import numeric_util |
| 24 | from .api import NpuActivationOp |
| 25 | from .api import NpuAddressRange |
| 26 | from .api import NpuBlockOperation |
| 27 | from .api import NpuDmaOperation |
| 28 | from .api import NpuElementWiseOp |
| 29 | from .api import NpuFeatureMap |
| 30 | from .api import NpuKernel |
| 31 | from .api import NpuLayout |
| 32 | from .api import NpuOperation |
| 33 | from .api import NpuOperationType |
| 34 | from .api import NpuPadding |
| 35 | from .api import NpuShape3D |
| 36 | from .architecture_features import ArchitectureFeatures |
| 37 | from .architecture_features import Block |
| 38 | from .architecture_features import Rect |
| 39 | from .operation import Kernel |
| 40 | from .operation import PointXYZ |
| 41 | from ethosu.vela.range_set import AccessDirection |
| 42 | from ethosu.vela.range_set import MemoryAccessSet |
| 43 | from ethosu.vela.range_set import MemoryRangeSet |
| 44 | |
| 45 | # base address slot for memory to memory transfer |
| 46 | BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0)) |
| 47 | |
| 48 | |
Michael McGeagh | f3e3ad7 | 2020-12-02 12:39:03 +0000 | [diff] [blame] | 49 | UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ) |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 50 | |
| 51 | |
| 52 | def to_npu_kernel(kernel: Kernel) -> NpuKernel: |
| 53 | """Converts the given internally used kernel object to NpuKernel (of public API)""" |
| 54 | return NpuKernel( |
| 55 | kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y |
| 56 | ) |
| 57 | |
| 58 | |
| 59 | def to_kernel(kernel: Optional[NpuKernel]) -> Kernel: |
| 60 | """Converts the given public API object to Kernel (used internally)""" |
| 61 | if kernel is None: |
| 62 | return Kernel(1, 1) |
| 63 | return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y) |
| 64 | |
| 65 | |
| 66 | def has_ifm2(npu_op: NpuBlockOperation) -> bool: |
| 67 | """Checks if op has non-scalar IFM2""" |
| 68 | return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None |
| 69 | |
| 70 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 71 | def shape3d_size(shape: NpuShape3D) -> int: |
| 72 | return shape.width * shape.height * shape.depth |
| 73 | |
| 74 | |
| 75 | def shape3d_to_rect(shape: NpuShape3D) -> Rect: |
| 76 | return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1) |
| 77 | |
| 78 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 79 | def shape3d_to_block(shape: NpuShape3D) -> Block: |
| 80 | return Block(shape.width, shape.height, shape.depth) |
| 81 | |
| 82 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 83 | # ------------------------------------------------------------------- |
| 84 | # ADDRESSING/STRIDES (helper functions) |
| 85 | # ------------------------------------------------------------------- |
| 86 | |
| 87 | |
| 88 | def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool: |
| 89 | """Checks if the ranges overlap""" |
| 90 | return range1.region == range2.region and numeric_util.overlaps( |
| 91 | range1.address, range1.address + range1.length, range2.address, range2.address + range2.length |
| 92 | ) |
| 93 | |
| 94 | |
| 95 | def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool: |
| 96 | """Checks if there is any address overlap between list1 and list2""" |
| 97 | for range1 in list1: |
| 98 | if range1 is None: |
| 99 | continue |
| 100 | for range2 in list2: |
| 101 | if range2 is not None and ranges_overlap(range1, range2): |
| 102 | return True |
| 103 | return False |
| 104 | |
| 105 | |
| 106 | def get_strides(fm: NpuFeatureMap) -> NpuShape3D: |
| 107 | """Calculates STRIDE_C/Y/X""" |
| 108 | if fm.strides is not None: |
| 109 | return fm.strides |
| 110 | elem_size = fm.data_type.size_in_bytes() |
| 111 | if fm.layout == NpuLayout.NHWC: |
| 112 | stride_c = elem_size |
| 113 | stride_x = fm.shape.depth * stride_c |
| 114 | stride_y = fm.shape.width * stride_x |
| 115 | else: |
| 116 | stride_x = 16 * elem_size |
| 117 | stride_c = stride_x * fm.shape.width |
| 118 | stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16) |
| 119 | return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x) |
| 120 | |
| 121 | |
| 122 | def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int: |
| 123 | """Returns address of given coordinate""" |
| 124 | t = 0 |
| 125 | BRICK = 16 |
| 126 | stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth |
| 127 | stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width |
| 128 | if x >= fm.tiles.width_0: |
| 129 | x -= fm.tiles.width_0 |
| 130 | t = 1 |
| 131 | if y >= fm.tiles.height_1: |
| 132 | y -= fm.tiles.height_1 |
| 133 | t += 2 |
| 134 | elif y >= fm.tiles.height_0: |
| 135 | y -= fm.tiles.height_0 |
| 136 | t += 2 |
| 137 | elem_size = fm.data_type.size_in_bytes() |
| 138 | return ( |
| 139 | fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size |
| 140 | ) |
| 141 | |
| 142 | |
| 143 | def get_address_range( |
| 144 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 145 | ) -> NpuAddressRange: |
| 146 | """ |
| 147 | Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm). |
| 148 | The begin and end coordinates must be within the same tile. |
| 149 | """ |
| 150 | addr0 = get_address(fm, strides, y0, x0, c0) |
| 151 | addr1 = get_address(fm, strides, y1, x1, c1) |
| 152 | return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes()) |
| 153 | |
| 154 | |
| 155 | def get_h_ranges( |
| 156 | fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int |
| 157 | ) -> List[NpuAddressRange]: |
| 158 | """ |
| 159 | Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm); |
| 160 | the begin and end coordinates must be within the same tile. |
| 161 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 162 | """ |
| 163 | return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)] |
| 164 | |
| 165 | |
| 166 | def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[NpuAddressRange]: |
| 167 | """ |
| 168 | Returns a list of adddress ranges that covers the area start - end (inclusive). |
| 169 | Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes". |
| 170 | |
| 171 | For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return |
| 172 | 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF] |
| 173 | |
| 174 | .....|.... .....|.... |
| 175 | t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1 |
| 176 | ..XXX|XX.. ..BBB|DD.. |
| 177 | -----+---- --> -----+---- |
| 178 | t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3 |
| 179 | .....|.... .....|.... |
| 180 | """ |
| 181 | strides = get_strides(fm) |
| 182 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
| 183 | h, w, c = fm.shape |
| 184 | y0, x0, c0 = start.y, start.x, start.z |
| 185 | y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1) |
| 186 | ranges = [] |
| 187 | if x0 < width_0 and y0 < height_0: |
| 188 | # Horizontal ranges for tile 0 |
| 189 | ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1)) |
| 190 | if x1 >= width_0 and y0 < height_1: |
| 191 | # Horizontal ranges for tile 1 |
| 192 | ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1)) |
| 193 | if x0 < width_0 and y1 >= height_0: |
| 194 | # Horizontal ranges for tile 2 |
| 195 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1)) |
| 196 | if x1 >= width_0 and y1 >= height_1: |
| 197 | # Horizontal ranges for tile 3 |
| 198 | ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1)) |
| 199 | return ranges |
| 200 | |
| 201 | |
| 202 | def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]: |
| 203 | """Returns 4 adddress ranges, one for every tile, None if the tile is not in use""" |
| 204 | strides = get_strides(fm) |
| 205 | height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth |
| 206 | height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0 |
| 207 | t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,) |
| 208 | if width > width_0: |
| 209 | t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1) |
| 210 | else: |
| 211 | t1 = None |
| 212 | if height > height_0: |
| 213 | t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1) |
| 214 | else: |
| 215 | t2 = None |
| 216 | if t1 is not None and t2 is not None: |
| 217 | t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1) |
| 218 | else: |
| 219 | t3 = None |
| 220 | return [t0, t1, t2, t3] |
| 221 | |
| 222 | |
| 223 | # ------------------------------------------------------------------- |
| 224 | # DMA_WAIT/KERNEL_WAIT |
| 225 | # ------------------------------------------------------------------- |
| 226 | |
| 227 | |
| 228 | class Watermark(NamedTuple): |
| 229 | npu: int |
| 230 | dma: int |
| 231 | |
| 232 | |
| 233 | def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet: |
| 234 | return MemoryRangeSet(range.region, range.address, range.address + range.length) |
| 235 | |
| 236 | |
| 237 | def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet: |
| 238 | """Returns the address that are read and written by the given DMA operation""" |
| 239 | res = MemoryAccessSet() |
| 240 | res.add(memory_range_set(dma_op.src), AccessDirection.Read) |
| 241 | res.add(memory_range_set(dma_op.dest), AccessDirection.Write) |
| 242 | return res |
| 243 | |
| 244 | |
| 245 | def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet: |
| 246 | """Returns the addresses that are read and written by the given operation""" |
| 247 | assert npu_op.ifm is not None and npu_op.ofm is not None |
| 248 | # Read addresses |
| 249 | read_ranges = get_address_ranges(npu_op.ifm) |
| 250 | if has_ifm2(npu_op): |
| 251 | assert npu_op.ifm2 is not None |
| 252 | read_ranges.extend(get_address_ranges(npu_op.ifm2)) |
| 253 | read_ranges.extend(npu_op.weights) |
| 254 | read_ranges.extend(npu_op.biases) |
| 255 | if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP: |
| 256 | address = arch.available_shram_banks(True) * arch.shram_bank_size |
| 257 | read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048)) |
| 258 | # Written addresses |
| 259 | write_ranges = get_address_ranges(npu_op.ofm) |
| 260 | # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks |
| 261 | uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 262 | written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size |
| 263 | write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size)) |
| 264 | |
| 265 | res = MemoryAccessSet() |
| 266 | for read_range in read_ranges: |
| 267 | if read_range is not None: |
| 268 | res.add(memory_range_set(read_range), AccessDirection.Read) |
| 269 | for write_range in write_ranges: |
| 270 | if write_range is not None: |
| 271 | res.add(memory_range_set(write_range), AccessDirection.Write) |
| 272 | return res |
| 273 | |
| 274 | |
| 275 | def get_wait_dependency( |
| 276 | arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark |
| 277 | ): |
| 278 | """Used to calculate whether DMA wait or kernel wait operations are needed""" |
| 279 | npu_op = npu_op_list[op_index] |
| 280 | op_access = memory_accesses[npu_op] |
| 281 | index = op_index - 1 |
| 282 | |
| 283 | # NPU dependency tracking |
| 284 | npu_outstanding = -1 |
| 285 | npu_ops = 0 |
| 286 | npu_index = watermark.npu |
| 287 | |
| 288 | # DMA dependency tracking |
| 289 | dma_outstanding = -1 |
| 290 | dma_ops = 0 |
| 291 | dma_index = watermark.dma |
| 292 | |
| 293 | # Seek back in the command stream looking for NPU or DMA dependencies |
| 294 | # but only as far as the first dependency or the watermarks (dependencies |
| 295 | # before this point have been satisfied already). |
| 296 | # The watermark moves to after the latest element we must wait for, not |
| 297 | # the command that issues the wait. |
| 298 | # NPU->NPU dependency is handled via blockdep. |
| 299 | while (index >= npu_index) or (index >= dma_index): |
| 300 | prev_op = npu_op_list[index] |
| 301 | prev_access = memory_accesses[prev_op] |
| 302 | |
| 303 | # Check NPU consuming DMA output |
Dwight Lidman | 9b43f84 | 2020-12-08 17:56:44 +0100 | [diff] [blame] | 304 | if isinstance(prev_op, NpuDmaOperation): |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 305 | if index >= dma_index: |
Dwight Lidman | 9b43f84 | 2020-12-08 17:56:44 +0100 | [diff] [blame] | 306 | if not isinstance(npu_op, NpuDmaOperation): |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 307 | if (dma_outstanding == -1) and prev_access.conflicts(op_access): |
| 308 | dma_outstanding = dma_ops |
| 309 | dma_ops += 1 # Count DMA ops in the pipeline |
| 310 | if dma_ops >= arch.max_outstanding_dma: |
| 311 | dma_index = max(index + 1, dma_index) |
| 312 | # Check DMA consuming NPU output |
| 313 | else: |
| 314 | if index >= npu_index: |
Dwight Lidman | 9b43f84 | 2020-12-08 17:56:44 +0100 | [diff] [blame] | 315 | if isinstance(npu_op, NpuDmaOperation) and npu_outstanding == -1 and prev_access.conflicts(op_access): |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 316 | npu_outstanding = npu_ops |
| 317 | npu_ops += 1 # Count NPU ops in the pipeline |
| 318 | if npu_ops >= arch.max_outstanding_kernels: |
| 319 | npu_index = max(index + 1, npu_index) |
| 320 | |
| 321 | index -= 1 |
| 322 | |
| 323 | # Update DMA watermark if we didn't see any and the NPU pipeline is full |
| 324 | if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels): |
| 325 | dma_index = op_index |
| 326 | |
| 327 | # Bring the search watermark forwards as we complete for those dependencies |
| 328 | watermark = Watermark(npu_index, dma_index) |
| 329 | outstanding = Watermark(npu_outstanding, dma_outstanding) |
| 330 | |
| 331 | return watermark, outstanding |
| 332 | |
| 333 | |
| 334 | # ------------------------------------------------------------------- |
| 335 | # BLOCKDEP |
| 336 | # ------------------------------------------------------------------- |
| 337 | |
| 338 | |
| 339 | def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int: |
| 340 | # Note: NOT equivalent to the normal ifm block depth calculation since |
| 341 | # it takes into account 'depthless' block operations by returning full |
| 342 | # depth |
| 343 | if npu_op.op_type == NpuOperationType.Conv2D: |
| 344 | res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits()) |
| 345 | return res |
| 346 | return npu_op.ofm.shape.depth |
| 347 | |
| 348 | |
| 349 | def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool: |
| 350 | """Checks if the two areas overlap""" |
| 351 | start_x = max(start_a.x, start_b.x) |
| 352 | end_x = min(end_a.x, end_b.x) |
| 353 | start_y = max(start_a.y, start_b.y) |
| 354 | end_y = min(end_a.y, end_b.y) |
| 355 | start_z = max(start_a.z, start_b.z) |
| 356 | end_z = min(end_a.z, end_b.z) |
| 357 | return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0) |
| 358 | |
| 359 | |
| 360 | def intersects( |
| 361 | ifm: NpuFeatureMap, |
| 362 | ifm_start_coord: PointXYZ, |
| 363 | ifm_end_coord: PointXYZ, |
| 364 | prev_ofm: NpuFeatureMap, |
| 365 | ofm_start_coord: PointXYZ, |
| 366 | ofm_end_coord: PointXYZ, |
| 367 | ) -> bool: |
| 368 | """Checks if the given IFM area overlaps with the given OFM area""" |
| 369 | if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles: |
| 370 | # Common case: prev_op.ofm == op.ifm; in this case it suffices to check |
| 371 | # if the xyz coordinates overlap, which is quick and easy |
| 372 | res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord) |
| 373 | else: |
| 374 | # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM. |
| 375 | # In this case, address comparison between the two areas is needed |
| 376 | ifm_ranges = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord) |
| 377 | prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord) |
| 378 | res = range_lists_overlap(ifm_ranges, prev_ofm_ranges) |
| 379 | return res |
| 380 | |
| 381 | |
| 382 | # Block job dependency: |
| 383 | # Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10) |
| 384 | # |
| 385 | # A | B |
| 386 | # ----------------------+------------------ |
| 387 | # .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER |
| 388 | # |<------->| dependency offset |
| 389 | # |
| 390 | |
| 391 | |
| 392 | def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]: |
| 393 | """ |
| 394 | Get the coordinates of a block offset from either the end (negative) |
| 395 | or the start (zero or positive) of the given 3D area |
| 396 | """ |
| 397 | size = area.size() |
| 398 | # Dimensions of the region, in blocks |
| 399 | width_blocks = numeric_util.round_up_divide(size.width, block.width) |
| 400 | height_blocks = numeric_util.round_up_divide(size.height, block.height) |
| 401 | depth_blocks = numeric_util.round_up_divide(size.depth, block.depth) |
| 402 | total_blocks = width_blocks * height_blocks * depth_blocks |
| 403 | if offset < 0: |
| 404 | index = total_blocks + offset |
| 405 | else: |
| 406 | index = offset |
| 407 | |
| 408 | if index >= total_blocks: |
| 409 | return None |
| 410 | |
| 411 | # Coordinates of the indexed block |
| 412 | coord_z = block.depth * (index % depth_blocks) |
| 413 | coord_y = block.height * (index // (depth_blocks * width_blocks)) |
| 414 | coord_x = block.width * ((index // depth_blocks) % width_blocks) |
| 415 | |
| 416 | return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z) |
| 417 | |
| 418 | |
| 419 | def get_first_job_input_volume( |
| 420 | arch: ArchitectureFeatures, |
| 421 | ifm: Rect, |
| 422 | ofm: Rect, |
| 423 | ifm_block_depth, |
| 424 | ofm_block: Block, |
| 425 | kernel: Kernel, |
| 426 | padding: NpuPadding, |
| 427 | block_offset: int, |
| 428 | ): |
| 429 | # Get ifm block size (jobs are invisibly decomposed into subkernels) |
| 430 | ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max) |
| 431 | ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth) |
| 432 | |
| 433 | # Which OFM block are we calculating |
| 434 | ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks) |
| 435 | if ofm_coord is None: |
| 436 | return None |
| 437 | |
| 438 | # Coordinate of the source IFM block |
| 439 | ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left) |
| 440 | ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right) |
| 441 | ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth |
| 442 | |
| 443 | # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM |
| 444 | start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z) |
| 445 | end_coord = PointXYZ( |
| 446 | x=start_coord[0] + ifm_block.width, y=start_coord[1] + ifm_block.height, z=start_coord[2] + ifm_block.depth, |
| 447 | ) |
| 448 | return (start_coord, end_coord, 1) # start, end, total jobs |
| 449 | |
| 450 | |
| 451 | def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int): |
| 452 | assert block_offset >= 0 |
| 453 | |
| 454 | # Get OFM block's volume coordinates |
| 455 | start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset) |
| 456 | if start_coord is None: |
| 457 | return None |
| 458 | end_coord = PointXYZ( |
| 459 | x=start_coord.x + ofm_block.width, y=start_coord.y + ofm_block.height, z=start_coord.z + ofm_block.depth, |
| 460 | ) |
| 461 | return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block |
| 462 | |
| 463 | |
| 464 | def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int: |
| 465 | """Calculates the value of the BLOCKDEP register""" |
| 466 | if prev_op is None: |
| 467 | return 0 |
| 468 | assert npu_op.ifm is not None |
| 469 | assert prev_op.ofm is not None |
Diqing Zhong | 455e20e | 2021-02-03 16:37:31 +0100 | [diff] [blame] | 470 | # Check if the reserved shram will be used in current/prev op |
| 471 | prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 472 | curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP |
| 473 | if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut: |
| 474 | return 0 |
| 475 | |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 476 | # Check if IFM or IFM2 overlaps with prev op's OFM |
| 477 | prev_ofm_ranges = get_address_ranges(prev_op.ofm) |
| 478 | ifm_ranges = get_address_ranges(npu_op.ifm) |
| 479 | ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges) |
| 480 | if has_ifm2(npu_op): |
| 481 | assert npu_op.ifm2 is not None |
| 482 | ifm2_ranges = get_address_ranges(npu_op.ifm2) |
| 483 | ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges) |
| 484 | else: |
| 485 | ifm2_overlaps = False |
| 486 | if ifm_overlaps and ifm2_overlaps: |
| 487 | # Both IFM and IFM2 overlap (should be rare) |
| 488 | return 0 |
| 489 | if not ifm_overlaps and not ifm2_overlaps: |
| 490 | # No overlap between prev OFM and IFM/IFM2 |
| 491 | return ArchitectureFeatures.MAX_BLOCKDEP |
| 492 | if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape): |
| 493 | # Prev OFM produces IFM2 which is broadcasted (this should be rare) |
| 494 | return 0 |
| 495 | # Prev OFM overlaps with IFM or IFM2; calculate the blockdep |
| 496 | prev_block_config = prev_op.block_config |
| 497 | block_config = npu_op.block_config |
| 498 | overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2 |
| 499 | assert overlapping_fm is not None |
| 500 | |
| 501 | cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op) |
| 502 | cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth) |
| 503 | cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape) |
| 504 | cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape) |
| 505 | padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding |
| 506 | blockdep = ArchitectureFeatures.MAX_BLOCKDEP |
| 507 | kernel = to_kernel(npu_op.kernel) |
| 508 | |
| 509 | prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth) |
| 510 | prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape) |
| 511 | # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window |
| 512 | # of IFM area overlaps with any previous OFM block generation. |
| 513 | elapsed_jobs = 0 |
| 514 | for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 515 | # This is the IFM block we want to sample from |
| 516 | in_area = get_first_job_input_volume( |
| 517 | arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset |
| 518 | ) |
| 519 | if in_area is None: |
| 520 | break |
| 521 | |
| 522 | # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs) |
| 523 | outstanding_jobs = 0 |
| 524 | for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP): |
| 525 | # This is the OFM block being generated by the previous op |
| 526 | out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset) |
| 527 | if out_area is None: |
| 528 | break |
| 529 | |
| 530 | # Block dependency is the max number of allowed outstanding jobs |
| 531 | # in the pipeline. Selected by determining how many jobs occur |
| 532 | # in between two operators' overlapping OFM->IFM block volumes |
| 533 | if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]): |
| 534 | break |
| 535 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 536 | elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 537 | break |
| 538 | |
| 539 | # This OFM had this many jobs (accumulate over multiple OFM blocks) |
| 540 | outstanding_jobs += out_area[2] |
| 541 | |
| 542 | blockdep = min(blockdep, elapsed_jobs + outstanding_jobs) |
| 543 | elapsed_jobs += in_area[2] |
| 544 | # Early exit if no intersections and we've seen enough jobs in the pipeline |
| 545 | if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP: |
| 546 | break |
| 547 | |
| 548 | return blockdep |