blob: 3751d88edc77d475a467d6c2eb854bb990a4cd5e [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Louis Verhaard1e170182020-11-26 11:42:04 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
35from .api import NpuShape3D
36from .architecture_features import ArchitectureFeatures
37from .architecture_features import Block
38from .architecture_features import Rect
39from .operation import Kernel
40from .operation import PointXYZ
41from ethosu.vela.range_set import AccessDirection
42from ethosu.vela.range_set import MemoryAccessSet
43from ethosu.vela.range_set import MemoryRangeSet
44
45# base address slot for memory to memory transfer
46BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))
47
48
Michael McGeaghf3e3ad72020-12-02 12:39:03 +000049UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
Louis Verhaard1e170182020-11-26 11:42:04 +010050
51
52def to_npu_kernel(kernel: Kernel) -> NpuKernel:
53 """Converts the given internally used kernel object to NpuKernel (of public API)"""
54 return NpuKernel(
55 kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
56 )
57
58
59def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
60 """Converts the given public API object to Kernel (used internally)"""
61 if kernel is None:
62 return Kernel(1, 1)
63 return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
64
65
66def has_ifm2(npu_op: NpuBlockOperation) -> bool:
67 """Checks if op has non-scalar IFM2"""
68 return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None
69
70
Louis Verhaard1e170182020-11-26 11:42:04 +010071def shape3d_size(shape: NpuShape3D) -> int:
72 return shape.width * shape.height * shape.depth
73
74
75def shape3d_to_rect(shape: NpuShape3D) -> Rect:
76 return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)
77
78
Tim Halld8339a72021-05-27 18:49:40 +010079def shape3d_to_block(shape: NpuShape3D) -> Block:
80 return Block(shape.width, shape.height, shape.depth)
81
82
Louis Verhaard1e170182020-11-26 11:42:04 +010083# -------------------------------------------------------------------
84# ADDRESSING/STRIDES (helper functions)
85# -------------------------------------------------------------------
86
87
88def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
89 """Checks if the ranges overlap"""
90 return range1.region == range2.region and numeric_util.overlaps(
91 range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
92 )
93
94
95def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
96 """Checks if there is any address overlap between list1 and list2"""
97 for range1 in list1:
98 if range1 is None:
99 continue
100 for range2 in list2:
101 if range2 is not None and ranges_overlap(range1, range2):
102 return True
103 return False
104
105
106def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
107 """Calculates STRIDE_C/Y/X"""
108 if fm.strides is not None:
109 return fm.strides
110 elem_size = fm.data_type.size_in_bytes()
111 if fm.layout == NpuLayout.NHWC:
112 stride_c = elem_size
113 stride_x = fm.shape.depth * stride_c
114 stride_y = fm.shape.width * stride_x
115 else:
116 stride_x = 16 * elem_size
117 stride_c = stride_x * fm.shape.width
118 stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
119 return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
120
121
122def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
123 """Returns address of given coordinate"""
124 t = 0
125 BRICK = 16
126 stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
127 stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
128 if x >= fm.tiles.width_0:
129 x -= fm.tiles.width_0
130 t = 1
131 if y >= fm.tiles.height_1:
132 y -= fm.tiles.height_1
133 t += 2
134 elif y >= fm.tiles.height_0:
135 y -= fm.tiles.height_0
136 t += 2
137 elem_size = fm.data_type.size_in_bytes()
138 return (
139 fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
140 )
141
142
143def get_address_range(
144 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
145) -> NpuAddressRange:
146 """
147 Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
148 The begin and end coordinates must be within the same tile.
149 """
150 addr0 = get_address(fm, strides, y0, x0, c0)
151 addr1 = get_address(fm, strides, y1, x1, c1)
152 return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())
153
154
155def get_h_ranges(
156 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
157) -> List[NpuAddressRange]:
158 """
159 Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
160 the begin and end coordinates must be within the same tile.
161 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
162 """
163 return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]
164
165
166def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[NpuAddressRange]:
167 """
168 Returns a list of adddress ranges that covers the area start - end (inclusive).
169 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
170
171 For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
172 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]
173
174 .....|.... .....|....
175 t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1
176 ..XXX|XX.. ..BBB|DD..
177 -----+---- --> -----+----
178 t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3
179 .....|.... .....|....
180 """
181 strides = get_strides(fm)
182 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
183 h, w, c = fm.shape
184 y0, x0, c0 = start.y, start.x, start.z
185 y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1)
186 ranges = []
187 if x0 < width_0 and y0 < height_0:
188 # Horizontal ranges for tile 0
189 ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1))
190 if x1 >= width_0 and y0 < height_1:
191 # Horizontal ranges for tile 1
192 ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1))
193 if x0 < width_0 and y1 >= height_0:
194 # Horizontal ranges for tile 2
195 ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1))
196 if x1 >= width_0 and y1 >= height_1:
197 # Horizontal ranges for tile 3
198 ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1))
199 return ranges
200
201
202def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
203 """Returns 4 adddress ranges, one for every tile, None if the tile is not in use"""
204 strides = get_strides(fm)
205 height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
206 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
207 t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
208 if width > width_0:
209 t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
210 else:
211 t1 = None
212 if height > height_0:
213 t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
214 else:
215 t2 = None
216 if t1 is not None and t2 is not None:
217 t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
218 else:
219 t3 = None
220 return [t0, t1, t2, t3]
221
222
223# -------------------------------------------------------------------
224# DMA_WAIT/KERNEL_WAIT
225# -------------------------------------------------------------------
226
227
228class Watermark(NamedTuple):
229 npu: int
230 dma: int
231
232
233def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
234 return MemoryRangeSet(range.region, range.address, range.address + range.length)
235
236
237def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
238 """Returns the address that are read and written by the given DMA operation"""
239 res = MemoryAccessSet()
240 res.add(memory_range_set(dma_op.src), AccessDirection.Read)
241 res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
242 return res
243
244
245def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
246 """Returns the addresses that are read and written by the given operation"""
247 assert npu_op.ifm is not None and npu_op.ofm is not None
248 # Read addresses
249 read_ranges = get_address_ranges(npu_op.ifm)
250 if has_ifm2(npu_op):
251 assert npu_op.ifm2 is not None
252 read_ranges.extend(get_address_ranges(npu_op.ifm2))
253 read_ranges.extend(npu_op.weights)
254 read_ranges.extend(npu_op.biases)
255 if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
256 address = arch.available_shram_banks(True) * arch.shram_bank_size
257 read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048))
258 # Written addresses
259 write_ranges = get_address_ranges(npu_op.ofm)
260 # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
261 uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
262 written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
263 write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size))
264
265 res = MemoryAccessSet()
266 for read_range in read_ranges:
267 if read_range is not None:
268 res.add(memory_range_set(read_range), AccessDirection.Read)
269 for write_range in write_ranges:
270 if write_range is not None:
271 res.add(memory_range_set(write_range), AccessDirection.Write)
272 return res
273
274
275def get_wait_dependency(
276 arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
277):
278 """Used to calculate whether DMA wait or kernel wait operations are needed"""
279 npu_op = npu_op_list[op_index]
280 op_access = memory_accesses[npu_op]
281 index = op_index - 1
282
283 # NPU dependency tracking
284 npu_outstanding = -1
285 npu_ops = 0
286 npu_index = watermark.npu
287
288 # DMA dependency tracking
289 dma_outstanding = -1
290 dma_ops = 0
291 dma_index = watermark.dma
292
293 # Seek back in the command stream looking for NPU or DMA dependencies
294 # but only as far as the first dependency or the watermarks (dependencies
295 # before this point have been satisfied already).
296 # The watermark moves to after the latest element we must wait for, not
297 # the command that issues the wait.
298 # NPU->NPU dependency is handled via blockdep.
299 while (index >= npu_index) or (index >= dma_index):
300 prev_op = npu_op_list[index]
301 prev_access = memory_accesses[prev_op]
302
303 # Check NPU consuming DMA output
Dwight Lidman9b43f842020-12-08 17:56:44 +0100304 if isinstance(prev_op, NpuDmaOperation):
Louis Verhaard1e170182020-11-26 11:42:04 +0100305 if index >= dma_index:
Dwight Lidman9b43f842020-12-08 17:56:44 +0100306 if not isinstance(npu_op, NpuDmaOperation):
Louis Verhaard1e170182020-11-26 11:42:04 +0100307 if (dma_outstanding == -1) and prev_access.conflicts(op_access):
308 dma_outstanding = dma_ops
309 dma_ops += 1 # Count DMA ops in the pipeline
310 if dma_ops >= arch.max_outstanding_dma:
311 dma_index = max(index + 1, dma_index)
312 # Check DMA consuming NPU output
313 else:
314 if index >= npu_index:
Dwight Lidman9b43f842020-12-08 17:56:44 +0100315 if isinstance(npu_op, NpuDmaOperation) and npu_outstanding == -1 and prev_access.conflicts(op_access):
Louis Verhaard1e170182020-11-26 11:42:04 +0100316 npu_outstanding = npu_ops
317 npu_ops += 1 # Count NPU ops in the pipeline
318 if npu_ops >= arch.max_outstanding_kernels:
319 npu_index = max(index + 1, npu_index)
320
321 index -= 1
322
323 # Update DMA watermark if we didn't see any and the NPU pipeline is full
324 if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
325 dma_index = op_index
326
327 # Bring the search watermark forwards as we complete for those dependencies
328 watermark = Watermark(npu_index, dma_index)
329 outstanding = Watermark(npu_outstanding, dma_outstanding)
330
331 return watermark, outstanding
332
333
334# -------------------------------------------------------------------
335# BLOCKDEP
336# -------------------------------------------------------------------
337
338
339def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
340 # Note: NOT equivalent to the normal ifm block depth calculation since
341 # it takes into account 'depthless' block operations by returning full
342 # depth
343 if npu_op.op_type == NpuOperationType.Conv2D:
344 res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
345 return res
346 return npu_op.ofm.shape.depth
347
348
349def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
350 """Checks if the two areas overlap"""
351 start_x = max(start_a.x, start_b.x)
352 end_x = min(end_a.x, end_b.x)
353 start_y = max(start_a.y, start_b.y)
354 end_y = min(end_a.y, end_b.y)
355 start_z = max(start_a.z, start_b.z)
356 end_z = min(end_a.z, end_b.z)
357 return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0)
358
359
360def intersects(
361 ifm: NpuFeatureMap,
362 ifm_start_coord: PointXYZ,
363 ifm_end_coord: PointXYZ,
364 prev_ofm: NpuFeatureMap,
365 ofm_start_coord: PointXYZ,
366 ofm_end_coord: PointXYZ,
367) -> bool:
368 """Checks if the given IFM area overlaps with the given OFM area"""
369 if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles:
370 # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
371 # if the xyz coordinates overlap, which is quick and easy
372 res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
373 else:
374 # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
375 # In this case, address comparison between the two areas is needed
376 ifm_ranges = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
377 prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
378 res = range_lists_overlap(ifm_ranges, prev_ofm_ranges)
379 return res
380
381
382# Block job dependency:
383# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
384#
385# A | B
386# ----------------------+------------------
387# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
388# |<------->| dependency offset
389#
390
391
392def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
393 """
394 Get the coordinates of a block offset from either the end (negative)
395 or the start (zero or positive) of the given 3D area
396 """
397 size = area.size()
398 # Dimensions of the region, in blocks
399 width_blocks = numeric_util.round_up_divide(size.width, block.width)
400 height_blocks = numeric_util.round_up_divide(size.height, block.height)
401 depth_blocks = numeric_util.round_up_divide(size.depth, block.depth)
402 total_blocks = width_blocks * height_blocks * depth_blocks
403 if offset < 0:
404 index = total_blocks + offset
405 else:
406 index = offset
407
408 if index >= total_blocks:
409 return None
410
411 # Coordinates of the indexed block
412 coord_z = block.depth * (index % depth_blocks)
413 coord_y = block.height * (index // (depth_blocks * width_blocks))
414 coord_x = block.width * ((index // depth_blocks) % width_blocks)
415
416 return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z)
417
418
419def get_first_job_input_volume(
420 arch: ArchitectureFeatures,
421 ifm: Rect,
422 ofm: Rect,
423 ifm_block_depth,
424 ofm_block: Block,
425 kernel: Kernel,
426 padding: NpuPadding,
427 block_offset: int,
428):
429 # Get ifm block size (jobs are invisibly decomposed into subkernels)
430 ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
431 ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)
432
433 # Which OFM block are we calculating
434 ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
435 if ofm_coord is None:
436 return None
437
438 # Coordinate of the source IFM block
439 ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left)
440 ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right)
441 ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth
442
443 # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
444 start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
445 end_coord = PointXYZ(
446 x=start_coord[0] + ifm_block.width, y=start_coord[1] + ifm_block.height, z=start_coord[2] + ifm_block.depth,
447 )
448 return (start_coord, end_coord, 1) # start, end, total jobs
449
450
451def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
452 assert block_offset >= 0
453
454 # Get OFM block's volume coordinates
455 start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset)
456 if start_coord is None:
457 return None
458 end_coord = PointXYZ(
459 x=start_coord.x + ofm_block.width, y=start_coord.y + ofm_block.height, z=start_coord.z + ofm_block.depth,
460 )
461 return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block
462
463
464def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
465 """Calculates the value of the BLOCKDEP register"""
466 if prev_op is None:
467 return 0
468 assert npu_op.ifm is not None
469 assert prev_op.ofm is not None
Diqing Zhong455e20e2021-02-03 16:37:31 +0100470 # Check if the reserved shram will be used in current/prev op
471 prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
472 curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
473 if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut:
474 return 0
475
Louis Verhaard1e170182020-11-26 11:42:04 +0100476 # Check if IFM or IFM2 overlaps with prev op's OFM
477 prev_ofm_ranges = get_address_ranges(prev_op.ofm)
478 ifm_ranges = get_address_ranges(npu_op.ifm)
479 ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
480 if has_ifm2(npu_op):
481 assert npu_op.ifm2 is not None
482 ifm2_ranges = get_address_ranges(npu_op.ifm2)
483 ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
484 else:
485 ifm2_overlaps = False
486 if ifm_overlaps and ifm2_overlaps:
487 # Both IFM and IFM2 overlap (should be rare)
488 return 0
489 if not ifm_overlaps and not ifm2_overlaps:
490 # No overlap between prev OFM and IFM/IFM2
491 return ArchitectureFeatures.MAX_BLOCKDEP
492 if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
493 # Prev OFM produces IFM2 which is broadcasted (this should be rare)
494 return 0
495 # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
496 prev_block_config = prev_op.block_config
497 block_config = npu_op.block_config
498 overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
499 assert overlapping_fm is not None
500
501 cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
502 cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
503 cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
504 cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
505 padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
506 blockdep = ArchitectureFeatures.MAX_BLOCKDEP
507 kernel = to_kernel(npu_op.kernel)
508
509 prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
510 prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
511 # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
512 # of IFM area overlaps with any previous OFM block generation.
513 elapsed_jobs = 0
514 for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
515 # This is the IFM block we want to sample from
516 in_area = get_first_job_input_volume(
517 arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
518 )
519 if in_area is None:
520 break
521
522 # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
523 outstanding_jobs = 0
524 for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
525 # This is the OFM block being generated by the previous op
526 out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
527 if out_area is None:
528 break
529
530 # Block dependency is the max number of allowed outstanding jobs
531 # in the pipeline. Selected by determining how many jobs occur
532 # in between two operators' overlapping OFM->IFM block volumes
533 if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
534 break
535 # Early exit if no intersections and we've seen enough jobs in the pipeline
536 elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
537 break
538
539 # This OFM had this many jobs (accumulate over multiple OFM blocks)
540 outstanding_jobs += out_area[2]
541
542 blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
543 elapsed_jobs += in_area[2]
544 # Early exit if no intersections and we've seen enough jobs in the pipeline
545 if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
546 break
547
548 return blockdep