blob: ce49fc29ec3446f56d6d63bda72600f01aa44aa0 [file] [log] [blame]
Louis Verhaard1e170182020-11-26 11:42:04 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
35from .api import NpuShape3D
36from .architecture_features import ArchitectureFeatures
37from .architecture_features import Block
38from .architecture_features import Rect
39from .operation import Kernel
40from .operation import PointXYZ
41from ethosu.vela.range_set import AccessDirection
42from ethosu.vela.range_set import MemoryAccessSet
43from ethosu.vela.range_set import MemoryRangeSet
44
# Base address slot used for memory to memory transfers
BASE_PTR_INDEX_MEM2MEM = (1 << 8) | 3
47
48
# Elementwise operation types that are classified as unary
UNARY_ELEMWISE_OPS = (
    NpuElementWiseOp.ABS,
    NpuElementWiseOp.LRELU,
    NpuElementWiseOp.CLZ,
)
Louis Verhaard1e170182020-11-26 11:42:04 +010050
51
def to_npu_kernel(kernel: Kernel) -> NpuKernel:
    """Builds the public API NpuKernel that corresponds to the given internal Kernel object"""
    stride = kernel.stride
    dilation = kernel.dilation
    return NpuKernel(kernel.width, kernel.height, stride.x, stride.y, dilation.x, dilation.y)
57
58
def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
    """Builds the internal Kernel for the given public API kernel; a missing kernel becomes a 1x1 Kernel"""
    if kernel is not None:
        return Kernel(
            kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y
        )
    return Kernel(1, 1)
64
65
def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Tells whether the op has a second IFM that is not a scalar"""
    if npu_op.ifm2 is None:
        return False
    return npu_op.ifm2_scalar is None
69
70
def is_dma_op(npu_op: NpuOperation) -> bool:
    """Tells whether the operation's type is NpuOperationType.Dma"""
    op_type = npu_op.op_type
    return op_type == NpuOperationType.Dma
74
75
def shape3d_size(shape: NpuShape3D) -> int:
    """Returns the number of elements in the shape (width * height * depth)"""
    plane_size = shape.width * shape.height
    return plane_size * shape.depth
78
79
def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    """Returns a Rect starting at (0, 0, 0) whose end coordinates are the last element of the shape"""
    last_x = shape.width - 1
    last_y = shape.height - 1
    last_z = shape.depth - 1
    return Rect(0, 0, 0, last_x, last_y, last_z)
82
83
84# -------------------------------------------------------------------
85# ADDRESSING/STRIDES (helper functions)
86# -------------------------------------------------------------------
87
88
def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the two address ranges are in the same region and overlap"""
    if range1.region != range2.region:
        # Ranges in different regions never overlap
        return False
    end1 = range1.address + range1.length
    end2 = range2.address + range2.length
    return numeric_util.overlaps(range1.address, end1, range2.address, end2)
94
95
def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """Checks if any range in list1 overlaps with any range in list2; None entries are ignored"""
    candidates = [entry for entry in list2 if entry is not None]
    return any(
        ranges_overlap(range1, range2) for range1 in list1 if range1 is not None for range2 in candidates
    )
105
106
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """
    Calculates STRIDE_C/Y/X for the feature map.
    Explicitly set strides are returned as is; otherwise the strides are derived from
    the layout, the shape and the element size.
    """
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        # Channels are contiguous, followed by x, followed by y
        stride_x = fm.shape.depth * elem_size
        return NpuShape3D(depth=elem_size, height=fm.shape.width * stride_x, width=stride_x)
    # Any other layout: channels are grouped in bricks of 16
    stride_x = 16 * elem_size
    return NpuShape3D(
        depth=stride_x * fm.shape.width,
        height=elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16),
        width=stride_x,
    )
121
122
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Calculates the address of the element at coordinate (y, x, c) in the feature map"""
    BRICK = 16
    elem_size = fm.data_type.size_in_bytes()
    brick_size = BRICK * elem_size
    stride_c = brick_size if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = brick_size if fm.layout == NpuLayout.NHCWB16 else strides.width
    tiles = fm.tiles
    # Select the tile that contains the coordinate, and make y/x relative to that tile
    tile = 0
    if x >= tiles.width_0:
        # Right hand side: tile 1, or tile 3 if below height_1
        x -= tiles.width_0
        tile = 1
        if y >= tiles.height_1:
            y -= tiles.height_1
            tile = 3
    elif y >= tiles.height_0:
        # Left hand side, below height_0: tile 2
        y -= tiles.height_0
        tile = 2
    brick_index = c // BRICK
    offset_in_brick = int(c % BRICK)
    return (
        tiles.addresses[tile] + y * strides.height + x * stride_x + brick_index * stride_c + offset_in_brick * elem_size
    )
142
143
def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Returns the address range that spans from element (y0, x0, c0) up to and including
    element (y1, x1, c1). Both coordinates must be located in the same tile.
    """
    first = get_address(fm, strides, y0, x0, c0)
    last = get_address(fm, strides, y1, x1, c1)
    # The last element itself is part of the range
    length = last - first + fm.data_type.size_in_bytes()
    return NpuAddressRange(region=fm.region, address=first, length=length)
154
155
def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Returns one address range per row for the area (y0, x0, c0) - (y1, x1, c1) (inclusive).
    Both coordinates must be located in the same tile.
    Every returned range covers a horizontal "stripe" of height 1.
    """
    stripes = []
    for y in range(y0, y1 + 1):
        stripes.append(get_address_range(fm, strides, y, x0, c0, y, x1, c1))
    return stripes
165
166
def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[NpuAddressRange]:
    """
    Returns a list of address ranges that covers the area start - end (inclusive).
    The end coordinate is clamped to the feature map's shape.
    The area is split per tile and then in horizontal "stripes" of height 1; one address range
    is returned for every stripe.

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

            .....|....           .....|....
        t0  ..XXX|XX..  t1   t0  ..AAA|CC..  t1
            ..XXX|XX..           ..BBB|DD..
            -----+----    -->    -----+----
        t2  ..XXX|XX..  t3   t2  ..EEE|FF..  t3
            .....|....           .....|....
    """
    strides = get_strides(fm)
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    fm_height, fm_width, fm_depth = fm.shape
    y0, x0, c0 = start.y, start.x, start.z
    y1 = min(end.y, fm_height - 1)
    x1 = min(end.x, fm_width - 1)
    c1 = min(end.z, fm_depth - 1)
    in_left_tiles = x0 < width_0
    in_right_tiles = x1 >= width_0
    ranges = []
    if in_left_tiles and y0 < height_0:
        # Tile 0 (top left)
        ranges += get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1)
    if in_right_tiles and y0 < height_1:
        # Tile 1 (top right)
        ranges += get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1)
    if in_left_tiles and y1 >= height_0:
        # Tile 2 (bottom left)
        ranges += get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1)
    if in_right_tiles and y1 >= height_1:
        # Tile 3 (bottom right)
        ranges += get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1)
    return ranges
201
202
def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one per tile; the entry of a tile that is not in use is None"""
    strides = get_strides(fm)
    shape, tiles = fm.shape, fm.tiles
    height, width, depth = shape.height, shape.width, shape.depth
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    last_c = depth - 1
    # Tile 1 is in use if the fm is wider than tile 0, tile 2 if the fm is higher than tile 0
    uses_t1 = width > width_0
    uses_t2 = height > height_0

    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, last_c)
    t1 = None
    if uses_t1:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, last_c)
    t2 = None
    if uses_t2:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, last_c)
    t3 = None
    if uses_t1 and uses_t2:
        # Tile 3 is only in use if both tile 1 and tile 2 are
        t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, last_c)
    return [t0, t1, t2, t3]
222
223
224# -------------------------------------------------------------------
225# DMA_WAIT/KERNEL_WAIT
226# -------------------------------------------------------------------
227
228
class Watermark(NamedTuple):
    """
    Pair of values, one for NPU operations and one for DMA operations.
    get_wait_dependency uses it both for the backward search limits (indices in the
    operation list) and for the outstanding operation counts that it returns.
    """

    # Value that applies to NPU (non-DMA) operations
    npu: int
    # Value that applies to DMA operations
    dma: int
232
233
def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    """Creates a MemoryRangeSet for the given address range (region, address, address + length)"""
    start = range.address
    end = start + range.length
    return MemoryRangeSet(range.region, start, end)
236
237
def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the memory accesses of the DMA operation: its source is read, its destination is written"""
    accesses = MemoryAccessSet()
    transfers = ((dma_op.src, AccessDirection.Read), (dma_op.dest, AccessDirection.Write))
    for addr_range, direction in transfers:
        accesses.add(memory_range_set(addr_range), direction)
    return accesses
244
245
def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """
    Returns the addresses that are read and written by the given operation.
    Read: IFM, non-scalar IFM2, weights, biases, and the LUT area in SHRAM for table lookup activations.
    Written: OFM, and the SHRAM area below the available SHRAM banks.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if uses_lut:
        # The LUT is located directly after the SHRAM banks that are available when a LUT is used
        lut_address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=lut_address, length=2048))

    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    accesses_per_direction = ((AccessDirection.Read, read_ranges), (AccessDirection.Write, write_ranges))
    for direction, ranges in accesses_per_direction:
        for addr_range in ranges:
            # Tiles that are not in use are represented by None
            if addr_range is not None:
                res.add(memory_range_set(addr_range), direction)
    return res
274
275
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """
    Used to calculate whether DMA wait or kernel wait operations are needed.

    Scans npu_op_list backwards, starting at the operation before op_index, looking for
    - the most recent DMA operation whose memory accesses conflict with those of a non-DMA npu_op
    - the most recent non-DMA operation whose memory accesses conflict with those of a DMA npu_op

    memory_accesses is indexed by operation; its values must provide a conflicts() method.

    Returns a tuple (watermark, outstanding):
    - watermark: the updated search limits, to be passed in when calling this function for later operations
    - outstanding: Watermark that contains, for NPU and DMA respectively, the number of operations
      of that kind that were seen before the conflicting operation was reached, or -1 if no conflict was found
    """
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1  # -1 means: no conflicting NPU op found (yet)
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1  # -1 means: no conflicting DMA op found (yet)
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    # Only the first (most recent) conflict is recorded
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    # Raising the DMA limit above index ends the DMA part of the search
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                # Only the first (most recent) conflict is recorded
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    # Raising the NPU limit above index ends the NPU part of the search
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
333
334
335# -------------------------------------------------------------------
336# BLOCKDEP
337# -------------------------------------------------------------------
338
339
def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    """
    Returns the block depth to use for the blockdep calculation.

    Note: NOT equivalent to the normal ifm block depth calculation: only Conv2D uses
    the architecture's ifm block depth, all other ('depthless') block operations
    get the full OFM depth.
    """
    if npu_op.op_type != NpuOperationType.Conv2D:
        return npu_op.ofm.shape.depth
    ifm = npu_op.ifm
    return arch.calc_ifm_block_depth(ifm.shape.depth, ifm.data_type.size_in_bits())
348
349
def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
    """Checks if area A and area B overlap, i.e. have a positive sized intersection in x, y and z"""
    # Size of the intersection per axis; zero or negative means no overlap on that axis
    overlap_x = min(end_a.x, end_b.x) - max(start_a.x, start_b.x)
    overlap_y = min(end_a.y, end_b.y) - max(start_a.y, start_b.y)
    overlap_z = min(end_a.z, end_b.z) - max(start_a.z, start_b.z)
    return overlap_x > 0 and overlap_y > 0 and overlap_z > 0
359
360
def intersects(
    ifm: NpuFeatureMap,
    ifm_start_coord: PointXYZ,
    ifm_end_coord: PointXYZ,
    prev_ofm: NpuFeatureMap,
    ofm_start_coord: PointXYZ,
    ofm_end_coord: PointXYZ,
) -> bool:
    """Checks if the given area of the IFM overlaps with the given area of the previous op's OFM"""
    same_shape_and_tiles = ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles
    if same_shape_and_tiles:
        # Common case: prev_op.ofm == op.ifm; comparing the xyz coordinates is sufficient,
        # which is quick and easy
        return coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
    # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
    # The addresses of the two areas must be compared
    ifm_ranges = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
    prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
    return range_lists_overlap(ifm_ranges, prev_ofm_ranges)
381
382
383# Block job dependency:
384# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
385#
386# A | B
387# ----------------------+------------------
388# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
389# |<------->| dependency offset
390#
391
392
def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
    """
    Returns the start coordinate of the block at the given offset in the given 3D area.
    A negative offset is counted from the end of the area, a zero or positive offset from the start.
    Blocks are numbered depth first, then in width, then in height.
    Returns None if the resulting block index lies beyond the last block.
    """
    size = area.size()
    # Number of blocks per dimension
    blocks_x = numeric_util.round_up_divide(size.width, block.width)
    blocks_y = numeric_util.round_up_divide(size.height, block.height)
    blocks_z = numeric_util.round_up_divide(size.depth, block.depth)
    total_blocks = blocks_x * blocks_y * blocks_z

    index = total_blocks + offset if offset < 0 else offset
    # NOTE(review): a negative offset with a magnitude larger than total_blocks gives a negative
    # index, which is not rejected here - confirm whether callers can pass such an offset
    if index >= total_blocks:
        return None

    # Split the linear index in a block index per dimension
    xy_index, z_index = divmod(index, blocks_z)
    y_index, x_index = divmod(xy_index, blocks_x)

    return PointXYZ(
        x=block.width * x_index + area.x, y=block.height * y_index + area.y, z=block.depth * z_index + area.z
    )
418
419
def get_first_job_input_volume(
    arch: ArchitectureFeatures,
    ifm: Rect,
    ofm: Rect,
    ifm_block_depth,
    ofm_block: Block,
    kernel: Kernel,
    padding: NpuPadding,
    block_offset: int,
):
    """
    Gets the IFM volume that is sampled by the job at the given offset from the first job of an operation.

    block_offset counts jobs from the start of the operation; every OFM block is visited once
    for every IFM depth block.

    Returns a tuple (start coordinate, end coordinate, number of jobs), where the end coordinate
    is the start coordinate plus the IFM block size, or None if block_offset lies beyond the last OFM block.
    """
    # Get ifm block size (jobs are invisibly decomposed into subkernels)
    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
    ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)

    # Which OFM block are we calculating
    ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
    if ofm_coord is None:
        return None

    # Coordinate of the source IFM block: the OFM coordinate is scaled by the stride and shifted
    # by the padding at the start of the axis, i.e. left padding for x and top padding for y
    ifm_coord_x = max(0, ofm_coord.x * kernel.stride.x - padding.left)
    ifm_coord_y = max(0, ofm_coord.y * kernel.stride.y - padding.top)
    ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth

    # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
    start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
    end_coord = PointXYZ(
        x=start_coord.x + ifm_block.width, y=start_coord.y + ifm_block.height, z=start_coord.z + ifm_block.depth,
    )
    return (start_coord, end_coord, 1)  # start, end, total jobs
450
451
def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
    """
    Gets the OFM volume of the block that is block_offset blocks before the last block of the OFM.
    Returns a tuple (start coordinate, end coordinate, number of jobs), or None if
    get_offset_block_coords finds no such block.
    """
    assert block_offset >= 0

    # Offset -1 is the last OFM block, -2 the one before that, etc.
    block_start = get_offset_block_coords(ofm, ofm_block, -(block_offset + 1))
    if block_start is None:
        return None
    block_end = PointXYZ(
        x=block_start.x + ofm_block.width, y=block_start.y + ofm_block.height, z=block_start.z + ofm_block.depth,
    )
    # start, end, total jobs for this OFM block
    return (block_start, block_end, 1)
463
464
def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """
    Calculates the value of the BLOCKDEP register.

    Returns
    - 0 if there is no previous operation, if both IFM and IFM2 overlap with the previous
      operation's OFM, or if only IFM2 overlaps and IFM2 has fewer elements than IFM
    - ArchitectureFeatures.MAX_BLOCKDEP if neither IFM nor IFM2 overlaps with the previous operation's OFM
    - otherwise the smallest number of jobs that was found between one of the last OFM blocks
      of prev_op and one of the first jobs of npu_op that reads from that block
    """
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_ranges = get_address_ranges(npu_op.ifm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_ranges = get_address_ranges(npu_op.ifm2)
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
    else:
        ifm2_overlaps = False
    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not ifm_overlaps and not ifm2_overlaps:
        # No overlap between prev OFM and IFM/IFM2
        return ArchitectureFeatures.MAX_BLOCKDEP
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0
    # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    # Exactly one of IFM/IFM2 overlaps at this point
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
    blockdep = ArchitectureFeatures.MAX_BLOCKDEP
    kernel = to_kernel(npu_op.kernel)

    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
    # of IFM area overlaps with any previous OFM block generation.
    elapsed_jobs = 0
    for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
        # This is the IFM block we want to sample from
        in_area = get_first_job_input_volume(
            arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
        )
        if in_area is None:
            # No more jobs in the current operation
            break

        # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
        outstanding_jobs = 0
        for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
            # This is the OFM block being generated by the previous op
            out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
            if out_area is None:
                break

            # Block dependency is the max number of allowed outstanding jobs
            # in the pipeline. Selected by determining how many jobs occur
            # in between two operators' overlapping OFM->IFM block volumes
            if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
                break
            # Early exit if no intersections and we've seen enough jobs in the pipeline
            elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
                break

            # This OFM had this many jobs (accumulate over multiple OFM blocks)
            outstanding_jobs += out_area[2]

        # Jobs of the current op before this one, plus the previous op's jobs after the dependency
        blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
        elapsed_jobs += in_area[2]
        # Early exit if no intersections and we've seen enough jobs in the pipeline
        if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
            break

    return blockdep