blob: 55fa620c5279b0b85efe8a4125a51899e69643ac [file] [log] [blame]
Louis Verhaard1e170182020-11-26 11:42:04 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
35from .api import NpuShape3D
36from .architecture_features import ArchitectureFeatures
37from .architecture_features import Block
38from .architecture_features import Rect
39from .operation import Kernel
40from .operation import PointXYZ
41from ethosu.vela.range_set import AccessDirection
42from ethosu.vela.range_set import MemoryAccessSet
43from ethosu.vela.range_set import MemoryRangeSet
44
# Base address slot used for memory to memory transfers:
# bit 8 set, combined with slot number 3 in the low bits
BASE_PTR_INDEX_MEM2MEM = (1 << 8) | (3 << 0)
47
48
# Element-wise operation types grouped as "unary"
# NOTE(review): presumably these are the element-wise ops that take no IFM2 - confirm against the users of this tuple
UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
Louis Verhaard1e170182020-11-26 11:42:04 +010050
51
def to_npu_kernel(kernel: Kernel) -> NpuKernel:
    """
    Builds the public API NpuKernel from the internally used Kernel object.

    Width, height, x/y stride and x/y dilation are copied over unchanged.
    """
    stride = kernel.stride
    dilation = kernel.dilation
    return NpuKernel(kernel.width, kernel.height, stride.x, stride.y, dilation.x, dilation.y)
57
58
def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
    """
    Builds the internally used Kernel from the public API NpuKernel.

    When no kernel is given, a 1x1 Kernel is returned.
    """
    if kernel is not None:
        return Kernel(
            kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y
        )
    return Kernel(1, 1)
64
65
def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Returns True if the op has an IFM2 that is not a scalar (ifm2 is set and ifm2_scalar is not)"""
    if npu_op.ifm2 is None:
        return False
    return npu_op.ifm2_scalar is None
69
70
def shape3d_size(shape: NpuShape3D) -> int:
    """Returns the number of elements in the shape (width * height * depth)"""
    width, height, depth = shape.width, shape.height, shape.depth
    return width * height * depth
73
74
def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    """Returns a Rect starting at the origin whose end coordinates are the last element of the shape (inclusive)"""
    last_x = shape.width - 1
    last_y = shape.height - 1
    last_z = shape.depth - 1
    return Rect(0, 0, 0, last_x, last_y, last_z)
77
78
79# -------------------------------------------------------------------
80# ADDRESSING/STRIDES (helper functions)
81# -------------------------------------------------------------------
82
83
def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the two address ranges are in the same region and their [address, address + length) spans overlap"""
    if not range1.region == range2.region:
        # Ranges in different regions never overlap
        return False
    end1 = range1.address + range1.length
    end2 = range2.address + range2.length
    return numeric_util.overlaps(range1.address, end1, range2.address, end2)
89
90
def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """
    Checks if any address range in list1 overlaps with any address range in list2.
    None entries in either list are ignored.
    """
    return any(
        ranges_overlap(first, second)
        for first in list1
        if first is not None
        for second in list2
        if second is not None
    )
100
101
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """
    Calculates STRIDE_C/Y/X, returned as NpuShape3D(depth=stride_c, height=stride_y, width=stride_x).

    Strides that are explicitly set on the feature map take precedence; otherwise
    they are derived from the element size, the shape and the layout.
    """
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    width = fm.shape.width
    depth = fm.shape.depth
    if fm.layout == NpuLayout.NHWC:
        # Channels are contiguous, followed by x and then y
        row_elem_stride = depth * elem_size
        return NpuShape3D(depth=elem_size, height=width * row_elem_stride, width=row_elem_stride)
    # Any other layout: x advances in bricks of 16 channels, depth is rounded up to a multiple of 16
    brick_stride = 16 * elem_size
    return NpuShape3D(
        depth=brick_stride * width,
        height=elem_size * width * numeric_util.round_up(depth, 16),
        width=brick_stride,
    )
116
117
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """
    Returns the address of the element at coordinate (y, x, c) of the feature map.

    The tile containing the coordinate is selected first; the coordinate is then
    made relative to that tile and combined with the tile's base address.
    """
    BRICK = 16
    elem_size = fm.data_type.size_in_bytes()
    brick_bytes = BRICK * elem_size
    # The channel offset is split in a whole-brick part and a within-brick part
    stride_c = brick_bytes if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = brick_bytes if fm.layout == NpuLayout.NHCWB16 else strides.width

    tiles = fm.tiles
    tile = 0
    if x >= tiles.width_0:
        # Right of the vertical split: tile 1, or tile 3 when below height_1
        x -= tiles.width_0
        tile = 1
        if y >= tiles.height_1:
            y -= tiles.height_1
            tile = 3
    elif y >= tiles.height_0:
        # Left of the vertical split and below height_0: tile 2
        y -= tiles.height_0
        tile = 2

    offset = y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    return tiles.addresses[tile] + offset
137
138
def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Returns the address range spanning coordinates (y0, x0, c0) up to and including (y1, x1, c1).
    Both coordinates must be within the feature map and within the same tile.
    """
    first = get_address(fm, strides, y0, x0, c0)
    last = get_address(fm, strides, y1, x1, c1)
    # "last" is the address of the final element, which itself is part of the range
    num_bytes = last - first + fm.data_type.size_in_bytes()
    return NpuAddressRange(region=fm.region, address=first, length=num_bytes)
149
150
def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Returns one address range per row for the area (y0, x0, c0) - (y1, x1, c1), end coordinate inclusive.
    The begin and end coordinates must be within the same tile.
    Every returned range covers a horizontal "stripe" of height 1; the list is empty if y1 < y0.
    """
    rows = range(y0, y1 + 1)
    return [get_address_range(fm, strides, row, x0, c0, row, x1, c1) for row in rows]
160
161
def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[NpuAddressRange]:
    """
    Returns a list of address ranges that covers the area start - end (inclusive).
    The end coordinate is clipped to the shape of the feature map.
    The area is split per tile and, within every tile, in horizontal "stripes" of height 1;
    one address range is returned for every stripe.

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

         .....|....             .....|....
     t0  ..XXX|XX..  t1     t0  ..AAA|CC..  t1
         ..XXX|XX..             ..BBB|DD..
         -----+----     -->     -----+----
     t2  ..XXX|XX..  t3     t2  ..EEE|FF..  t3
         .....|....             .....|....
    """
    strides = get_strides(fm)
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    fm_height, fm_width, fm_depth = fm.shape
    y0, x0, c0 = start.y, start.x, start.z
    y1 = min(end.y, fm_height - 1)
    x1 = min(end.x, fm_width - 1)
    c1 = min(end.z, fm_depth - 1)

    touches_left = x0 < width_0
    touches_right = x1 >= width_0
    # Per tile, in tile order 0..3: (area touches the tile, first row, first column, last row, last column)
    tile_areas = (
        (touches_left and y0 < height_0, y0, x0, min(y1, height_0 - 1), min(x1, width_0 - 1)),
        (touches_right and y0 < height_1, y0, max(x0, width_0), min(y1, height_1 - 1), x1),
        (touches_left and y1 >= height_0, max(y0, height_0), x0, y1, min(x1, width_0 - 1)),
        (touches_right and y1 >= height_1, max(y0, height_1), max(x0, width_0), y1, x1),
    )
    ranges = []
    for in_tile, row0, col0, row1, col1 in tile_areas:
        if in_tile:
            ranges.extend(get_h_ranges(fm, strides, row0, col0, c0, row1, col1, c1))
    return ranges
196
197
def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """
    Returns 4 address ranges, one for every tile (in tile order 0..3); None if the tile is not in use.
    Tile 0 is always in use.
    """
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    last_y, last_x, last_c = height - 1, width - 1, depth - 1
    # Last column that is part of the left hand tiles (tile 0 and tile 2)
    left_last_x = min(width, width_0) - 1
    has_right_tiles = width > width_0
    has_bottom_tiles = height > height_0

    ranges: List[Optional[NpuAddressRange]] = [
        get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, left_last_x, last_c),
        None,
        None,
        None,
    ]
    if has_right_tiles:
        ranges[1] = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, last_x, last_c)
    if has_bottom_tiles:
        ranges[2] = get_address_range(fm, strides, height_0, 0, 0, last_y, left_last_x, last_c)
    if has_right_tiles and has_bottom_tiles:
        # Tile 3 is only used when both tile 1 and tile 2 are in use
        ranges[3] = get_address_range(fm, strides, height_1, width_0, 0, last_y, last_x, last_c)
    return ranges
217
218
219# -------------------------------------------------------------------
220# DMA_WAIT/KERNEL_WAIT
221# -------------------------------------------------------------------
222
223
class Watermark(NamedTuple):
    """
    Pair of values, one for the NPU and one for DMA.

    Used by get_wait_dependency both for the search watermarks (indices into the
    operation list) and for the outstanding-operation counts that it returns.
    """

    # Value that applies to NPU (kernel) operations
    npu: int
    # Value that applies to DMA operations
    dma: int
227
228
def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    """Converts the address range to a MemoryRangeSet with bounds address and address + length"""
    start = range.address
    end = range.address + range.length
    return MemoryRangeSet(range.region, start, end)
231
232
def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read (src) and written (dest) by the given DMA operation"""
    accesses = MemoryAccessSet()
    transfers = (
        (dma_op.src, AccessDirection.Read),
        (dma_op.dest, AccessDirection.Write),
    )
    for address_range, direction in transfers:
        accesses.add(memory_range_set(address_range), direction)
    return accesses
239
240
def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """
    Returns the addresses that are read and written by the given operation.

    Reads: IFM, non-scalar IFM2, weights, biases and, for table lookup activations, the LUT area.
    Writes: OFM and the SHRAM banks available to the operation.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    # Read addresses
    reads = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        reads += get_address_ranges(npu_op.ifm2)
    reads += npu_op.weights
    reads += npu_op.biases
    if uses_lut:
        # The 2048 byte range that starts right after the SHRAM banks that are available when a LUT is used
        lut_address = arch.available_shram_banks(True) * arch.shram_bank_size
        reads.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=lut_address, length=2048))

    # Written addresses
    writes = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    shram_bytes = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    writes.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=shram_bytes))

    accesses = MemoryAccessSet()
    for direction, address_ranges in ((AccessDirection.Read, reads), (AccessDirection.Write, writes)):
        for address_range in address_ranges:
            # Entries for unused tiles are None
            if address_range is not None:
                accesses.add(memory_range_set(address_range), direction)
    return accesses
269
270
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """
    Used to calculate whether DMA wait or kernel wait operations are needed.

    Walks backwards through npu_op_list, starting just before op_index, and looks for
    memory access conflicts between the operation at op_index and earlier operations
    of the "other" kind (DMA vs. non-DMA).

    Parameters:
        arch: provides max_outstanding_dma and max_outstanding_kernels
        npu_op_list: the operations, in command stream order
        memory_accesses: lookup from operation to its memory accesses; the looked up
            objects must provide conflicts()
        op_index: index in npu_op_list of the operation to calculate the dependencies for
        watermark: indices below which NPU/DMA dependencies do not need to be checked

    Returns a tuple (watermark, outstanding):
        watermark: the updated Watermark to pass in for the next operation
        outstanding: Watermark with, for NPU and DMA respectively, the number of
            operations of that kind that were passed before the conflicting operation
            was found, or -1 if no conflict was found
    """
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if isinstance(prev_op, NpuDmaOperation):
            if index >= dma_index:
                if not isinstance(npu_op, NpuDmaOperation):
                    # Only the first (most recent) conflicting DMA operation is recorded
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    # No need to search further back than this for DMA operations
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                # Only the first (most recent) conflicting NPU operation is recorded
                if isinstance(npu_op, NpuDmaOperation) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    # No need to search further back than this for NPU operations
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
328
329
330# -------------------------------------------------------------------
331# BLOCKDEP
332# -------------------------------------------------------------------
333
334
def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    """
    Returns the IFM block depth to use in the blockdep calculation.

    Note: NOT equivalent to the normal ifm block depth calculation since
    it takes into account 'depthless' block operations by returning full
    depth (the OFM depth) for everything except Conv2D.
    """
    if npu_op.op_type != NpuOperationType.Conv2D:
        return npu_op.ofm.shape.depth
    ifm = npu_op.ifm
    return arch.calc_ifm_block_depth(ifm.shape.depth, ifm.data_type.size_in_bits())
343
344
def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
    """
    Checks if the two areas overlap.

    The areas overlap only if, on every axis, the intersection of the two
    intervals has a length greater than zero; areas that merely touch do not overlap.
    """
    for axis in ("x", "y", "z"):
        lower = max(getattr(start_a, axis), getattr(start_b, axis))
        upper = min(getattr(end_a, axis), getattr(end_b, axis))
        if upper - lower <= 0:
            return False
    return True
354
355
def intersects(
    ifm: NpuFeatureMap,
    ifm_start_coord: PointXYZ,
    ifm_end_coord: PointXYZ,
    prev_ofm: NpuFeatureMap,
    ofm_start_coord: PointXYZ,
    ofm_end_coord: PointXYZ,
) -> bool:
    """
    Checks if the given IFM area overlaps with the given OFM area.

    When both feature maps have the same shape and tiles, the coordinates are
    compared directly; otherwise the address ranges of the two areas are compared.
    """
    same_fm = ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles
    if same_fm:
        # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
        # if the xyz coordinates overlap, which is quick and easy
        return coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
    # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
    # In this case, address comparison between the two areas is needed
    ifm_ranges = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
    prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
    return range_lists_overlap(ifm_ranges, prev_ofm_ranges)
376
377
378# Block job dependency:
379# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
380#
381# A | B
382# ----------------------+------------------
383# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
384# |<------->| dependency offset
385#
386
387
def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
    """
    Get the coordinates of a block offset from either the end (negative)
    or the start (zero or positive) of the given 3D area.

    Blocks are numbered with depth varying fastest, then width, then height.
    Returns None if the resulting block index is past the last block of the area.
    """
    size = area.size()
    # Dimensions of the region, in blocks
    blocks_in_width = numeric_util.round_up_divide(size.width, block.width)
    blocks_in_height = numeric_util.round_up_divide(size.height, block.height)
    blocks_in_depth = numeric_util.round_up_divide(size.depth, block.depth)
    total_blocks = blocks_in_width * blocks_in_height * blocks_in_depth

    # A negative offset counts back from the end: -1 is the last block
    index = total_blocks + offset if offset < 0 else offset
    # NOTE(review): a negative offset with a magnitude larger than total_blocks gives a
    # negative index, which is not rejected here - confirm that callers rely on this
    if index >= total_blocks:
        return None

    # Coordinates of the indexed block
    plane_index, depth_index = divmod(index, blocks_in_depth)
    row_index, column_index = divmod(plane_index, blocks_in_width)

    return PointXYZ(
        x=block.width * column_index + area.x,
        y=block.height * row_index + area.y,
        z=block.depth * depth_index + area.z,
    )
413
414
def get_first_job_input_volume(
    arch: ArchitectureFeatures,
    ifm: Rect,
    ofm: Rect,
    ifm_block_depth,
    ofm_block: Block,
    kernel: Kernel,
    padding: NpuPadding,
    block_offset: int,
):
    """
    Calculates the IFM volume that is sampled by the job at position block_offset,
    counted from the first job of an operation.

    Parameters:
        arch: provides get_ifm_block_size and ofm_block_max
        ifm: area of the IFM
        ofm: area of the OFM
        ifm_block_depth: depth of an IFM block
        ofm_block: size of an OFM block
        kernel: kernel of the operation; its stride maps OFM coordinates to IFM coordinates
        padding: padding of the operation; left/top padding shifts the IFM start coordinate
        block_offset: job number, counted from the start of the operation

    Returns None if the OFM has no block for the given offset, else a tuple
    (start coordinate, end coordinate, total jobs), where the end coordinate is
    the start coordinate plus the IFM block size.
    """
    # Get ifm block size (jobs are invisibly decomposed into subkernels)
    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
    ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)

    # Which OFM block are we calculating
    ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
    if ofm_coord is None:
        return None

    # Coordinate of the source IFM block: the horizontal position is shifted by the
    # left padding and the vertical position by the top padding, clamped to the IFM origin
    ifm_coord_x = max(0, ofm_coord.x * kernel.stride.x - padding.left)
    ifm_coord_y = max(0, ofm_coord.y * kernel.stride.y - padding.top)
    ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth

    # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
    start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
    end_coord = PointXYZ(
        x=start_coord.x + ifm_block.width,
        y=start_coord.y + ifm_block.height,
        z=start_coord.z + ifm_block.depth,
    )
    return (start_coord, end_coord, 1)  # start, end, total jobs
445
446
def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
    """
    Calculates the OFM volume that is produced by the job at position block_offset,
    counted back from the last job of an operation (0 is the last job).

    Returns None if get_offset_block_coords finds no block for the offset, else a tuple
    (start coordinate, end coordinate, total jobs for this OFM block), where the
    end coordinate is the start coordinate plus the OFM block size.
    """
    assert block_offset >= 0

    # Get OFM block's volume coordinates; offset -1 is the last block of the OFM
    offset_from_end = -1 - block_offset
    block_start = get_offset_block_coords(ofm, ofm_block, offset_from_end)
    if block_start is None:
        return None
    block_end = PointXYZ(
        x=block_start.x + ofm_block.width,
        y=block_start.y + ofm_block.height,
        z=block_start.z + ofm_block.depth,
    )
    return (block_start, block_end, 1)  # start, end, total jobs for this OFM block
458
459
def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """
    Calculates the value of the BLOCKDEP register.

    Parameters:
        arch: architecture features, used for the block size calculations
        prev_op: the preceding block operation, or None if there is none
        npu_op: the operation to calculate the block dependency for

    Returns:
        0 if there is no previous operation, if both IFM and IFM2 overlap with the
        previous OFM, or if the previous OFM produces an IFM2 that is smaller than the IFM;
        ArchitectureFeatures.MAX_BLOCKDEP if neither IFM nor IFM2 overlaps with the previous OFM;
        otherwise the smallest number of jobs found between a block of the previous OFM
        and a first IFM block of this operation that overlaps with it, limited to MAX_BLOCKDEP.
    """
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_ranges = get_address_ranges(npu_op.ifm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_ranges = get_address_ranges(npu_op.ifm2)
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
    else:
        ifm2_overlaps = False
    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not ifm_overlaps and not ifm2_overlaps:
        # No overlap between prev OFM and IFM/IFM2
        return ArchitectureFeatures.MAX_BLOCKDEP
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0
    # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    # Exactly one of IFM/IFM2 overlaps at this point
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    # Operations without padding are treated as having zero padding
    padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
    blockdep = ArchitectureFeatures.MAX_BLOCKDEP
    kernel = to_kernel(npu_op.kernel)

    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
    # of IFM area overlaps with any previous OFM block generation.
    elapsed_jobs = 0
    for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
        # This is the IFM block we want to sample from
        in_area = get_first_job_input_volume(
            arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
        )
        if in_area is None:
            # This operation has no job at this offset
            break

        # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
        outstanding_jobs = 0
        for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
            # This is the OFM block being generated by the previous op
            out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
            if out_area is None:
                break

            # Block dependency is the max number of allowed outstanding jobs
            # in the pipeline. Selected by determining how many jobs occur
            # in between two operators' overlapping OFM->IFM block volumes
            if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
                break
            # Early exit if no intersections and we've seen enough jobs in the pipeline
            elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
                break

            # This OFM had this many jobs (accumulate over multiple OFM blocks)
            outstanding_jobs += out_area[2]

        # Jobs of this operation issued so far, plus jobs of the previous operation
        # that were passed before the overlap (or before the search ended)
        blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
        elapsed_jobs += in_area[2]
        # Early exit if no intersections and we've seen enough jobs in the pipeline
        if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
            break

    return blockdep