blob: b2c84d7ca7225ddc3b7d049e40fb34a617efa9c2 [file] [log] [blame]
# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
35from .api import NpuShape3D
36from .architecture_features import ArchitectureFeatures
37from .architecture_features import Block
38from .architecture_features import Rect
39from .operation import Kernel
40from .operation import PointXYZ
41from ethosu.vela.range_set import AccessDirection
42from ethosu.vela.range_set import MemoryAccessSet
43from ethosu.vela.range_set import MemoryRangeSet
44
# Base address slot for memory to memory transfer (bit 8 set, low bits equal to 3, i.e. 0x103)
BASE_PTR_INDEX_MEM2MEM = (1 << 8) | 3
47
48
# Elementwise operation types that are classified as unary
UNARY_ELEMWISE_OPS = (
    NpuElementWiseOp.ABS,
    NpuElementWiseOp.LRELU,
    NpuElementWiseOp.CLZ,
)
Louis Verhaard1e170182020-11-26 11:42:04 +010050
51
def to_npu_kernel(kernel: Kernel) -> NpuKernel:
    """
    Builds the public API NpuKernel that corresponds to the given internal Kernel.

    Width, height, stride (x, y) and dilation (x, y) are passed on unchanged, in that order.
    """
    stride, dilation = kernel.stride, kernel.dilation
    return NpuKernel(kernel.width, kernel.height, stride.x, stride.y, dilation.x, dilation.y)
57
58
def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
    """
    Builds the internal Kernel that corresponds to the given public API NpuKernel.

    When no kernel is given, a Kernel(1, 1) is returned.
    """
    if kernel is not None:
        return Kernel(
            kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y
        )
    return Kernel(1, 1)
64
65
def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Returns True if the op has an IFM2 feature map and no IFM2 scalar value is set"""
    if npu_op.ifm2 is None:
        return False
    return npu_op.ifm2_scalar is None
69
70
def shape3d_size(shape: NpuShape3D) -> int:
    """Returns the number of elements in the shape (width * height * depth)"""
    plane = shape.width * shape.height
    return plane * shape.depth
73
74
def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    """Returns the Rect from (0, 0, 0) to (width - 1, height - 1, depth - 1), i.e. with inclusive end values"""
    last_w = shape.width - 1
    last_h = shape.height - 1
    last_d = shape.depth - 1
    return Rect(0, 0, 0, last_w, last_h, last_d)
77
78
def shape3d_to_block(shape: NpuShape3D) -> Block:
    """Returns a Block with the same width, height and depth as the given shape"""
    width, height, depth = shape.width, shape.height, shape.depth
    return Block(width, height, depth)
81
82
# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------
86
87
def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """
    Checks if the two address ranges overlap.

    Ranges in different regions never overlap; within the same region the intervals
    [address, address + length) are compared using numeric_util.overlaps.
    """
    if range1.region != range2.region:
        return False
    end1 = range1.address + range1.length
    end2 = range2.address + range2.length
    return numeric_util.overlaps(range1.address, end1, range2.address, end2)
93
94
def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """
    Checks if any address range in list1 overlaps with any address range in list2.

    None entries in either list are skipped.
    """
    return any(
        ranges_overlap(first, second)
        for first in list1
        if first is not None
        for second in list2
        if second is not None
    )
104
105
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """
    Calculates STRIDE_C/Y/X for the feature map.

    Explicit strides set on the feature map are returned as they are. Otherwise the strides are derived
    from the element size, the shape and the layout; they are returned as NpuShape3D with
    depth=STRIDE_C, height=STRIDE_Y and width=STRIDE_X.
    """
    explicit_strides = fm.strides
    if explicit_strides is not None:
        return explicit_strides

    bytes_per_elem = fm.data_type.size_in_bytes()
    fm_width = fm.shape.width
    fm_depth = fm.shape.depth
    if fm.layout != NpuLayout.NHWC:
        # Non-NHWC layout: strides are calculated in groups of 16 channels
        stride_x = 16 * bytes_per_elem
        stride_c = stride_x * fm_width
        stride_y = bytes_per_elem * fm_width * numeric_util.round_up(fm_depth, 16)
    else:
        # NHWC: channels are contiguous, followed by x, followed by y
        stride_c = bytes_per_elem
        stride_x = fm_depth * stride_c
        stride_y = fm_width * stride_x
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
120
121
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """
    Returns address of given coordinate.

    The coordinate is first mapped to one of the feature map's tiles (0-3) and made relative to that
    tile; the address is then the tile's base address plus the y, x and channel offsets.
    """
    brick = 16
    elem_size = fm.data_type.size_in_bytes()
    brick_bytes = brick * elem_size
    # A full brick of channels is one stride_c apart; for NHWC that is simply 16 consecutive elements
    stride_c = brick_bytes if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = brick_bytes if fm.layout == NpuLayout.NHCWB16 else strides.width

    tiles = fm.tiles
    tile = 0
    if x >= tiles.width_0:
        # Right-hand column of tiles (1 and 3), which is split vertically at height_1
        x -= tiles.width_0
        tile = 1
        if y >= tiles.height_1:
            y -= tiles.height_1
            tile += 2
    elif y >= tiles.height_0:
        # Left-hand column, below the split at height_0 (tile 2)
        y -= tiles.height_0
        tile += 2

    brick_index, offset_in_brick = divmod(c, brick)
    return (
        tiles.addresses[tile]
        + y * strides.height
        + x * stride_x
        + brick_index * stride_c
        + offset_in_brick * elem_size
    )
141
142
def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
    The begin and end coordinates must be within the same tile.

    The range starts at the address of the first coordinate and ends after the element at the second one.
    """
    first_addr = get_address(fm, strides, y0, x0, c0)
    last_addr = get_address(fm, strides, y1, x1, c1)
    # The end coordinate is inclusive, so the last element itself is part of the range
    num_bytes = last_addr - first_addr + fm.data_type.size_in_bytes()
    return NpuAddressRange(region=fm.region, address=first_addr, length=num_bytes)
153
154
def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
    the begin and end coordinates must be within the same tile.
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes"
    (one range per row, from y0 up to and including y1).
    """
    stripes: List[NpuAddressRange] = []
    for row in range(y0, y1 + 1):
        stripes.append(get_address_range(fm, strides, row, x0, c0, row, x1, c1))
    return stripes
164
165
def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]:
    """
    Returns a list of address ranges that covers the area start - end (inclusive).
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
    The end coordinate is clamped to the last element of the feature map in every dimension.

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

          .....|....           .....|....
     t0   ..XXX|XX..   t1   t0 ..AAA|CC..  t1
          ..XXX|XX..           ..BBB|DD..
          -----+----    -->    -----+----
     t2   ..XXX|XX..   t3   t2 ..EEE|FF..  t3
          .....|....           .....|....
    """
    strides = get_strides(fm)
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    fm_height, fm_width, fm_depth = fm.shape
    y0, x0, c0 = start.y, start.x, start.z
    y1 = min(end.y, fm_height - 1)
    x1 = min(end.x, fm_width - 1)
    c1 = min(end.z, fm_depth - 1)

    # Which tile columns does the area touch, and where is it cut at the column boundary
    touches_left = x0 < width_0
    touches_right = x1 >= width_0
    left_x1 = min(x1, width_0 - 1)
    right_x0 = max(x0, width_0)

    ranges: List[Optional[NpuAddressRange]] = []
    if touches_left and y0 < height_0:
        # Horizontal ranges for tile 0
        ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), left_x1, c1))
    if touches_right and y0 < height_1:
        # Horizontal ranges for tile 1
        ranges.extend(get_h_ranges(fm, strides, y0, right_x0, c0, min(y1, height_1 - 1), x1, c1))
    if touches_left and y1 >= height_0:
        # Horizontal ranges for tile 2
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, left_x1, c1))
    if touches_right and y1 >= height_1:
        # Horizontal ranges for tile 3
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), right_x0, c0, y1, x1, c1))
    return ranges
200
201
def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """
    Returns 4 address ranges, one for every tile, None if the tile is not in use.

    Tile 0 is always present; tile 1 is used when the feature map is wider than width_0, tile 2 when it
    is higher than height_0, and tile 3 only when both tile 1 and tile 2 are used.
    """
    strides = get_strides(fm)
    shape, tiles = fm.shape, fm.tiles
    height, width, depth = shape.height, shape.width, shape.depth
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    # Last (inclusive) coordinates of the feature map and of the left-hand tile column
    last_y, last_x, last_c = height - 1, width - 1, depth - 1
    last_x_left = min(width, width_0) - 1

    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, last_x_left, last_c)
    t1 = None
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, last_x, last_c)
    t2 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, last_y, last_x_left, last_c)
    t3 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_1, width_0, 0, last_y, last_x, last_c)
    return [t0, t1, t2, t3]
230
231
# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------
235
236
class Watermark(NamedTuple):
    """Pair of values, one for the NPU and one for DMA, as used by the wait dependency calculation"""

    # Value that applies to NPU operations
    npu: int
    # Value that applies to DMA operations
    dma: int
240
241
def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    """Converts the address range to a MemoryRangeSet for the same region, from address to address + length"""
    start = range.address
    return MemoryRangeSet(range.region, start, start + range.length)
244
245
def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read (the source) and written (the destination) by the given DMA operation"""
    accesses = MemoryAccessSet()
    transfers = ((dma_op.src, AccessDirection.Read), (dma_op.dest, AccessDirection.Write))
    for addr_range, direction in transfers:
        accesses.add(memory_range_set(addr_range), direction)
    return accesses
252
253
def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """
    Returns the addresses that are read and written by the given operation.

    Reads: IFM, non-scalar IFM2, weights, biases and, for table lookup activations, the LUT area in SHRAM.
    Writes: OFM and the SHRAM banks available to the operation.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    bank_size = arch.shram_bank_size

    # Read addresses
    reads = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        reads.extend(get_address_ranges(npu_op.ifm2))
    reads.extend(npu_op.weights)
    reads.extend(npu_op.biases)
    if uses_lut:
        # The LUT is read from the 2048 bytes that follow the banks available when a LUT is in use
        lut_address = arch.available_shram_banks(True) * bank_size
        reads.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=lut_address, length=2048))

    # Written addresses
    writes = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    shram_bytes = arch.available_shram_banks(uses_lut) * bank_size
    writes.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=shram_bytes))

    accesses = MemoryAccessSet()
    for direction, addr_ranges in ((AccessDirection.Read, reads), (AccessDirection.Write, writes)):
        for addr_range in addr_ranges:
            # Unused tiles are represented by None and do not contribute any access
            if addr_range is not None:
                accesses.add(memory_range_set(addr_range), direction)
    return accesses
282
283
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """
    Used to calculate whether DMA wait or kernel wait operations are needed.

    Walks backwards through npu_op_list, starting just before op_index, looking for memory conflicts
    between the operation at op_index and earlier operations of the other kind (NPU op vs. earlier DMA,
    DMA op vs. earlier NPU op). memory_accesses is indexed by operation and must provide an object with
    a conflicts() method for every operation visited.

    Returns a tuple (watermark, outstanding) of Watermark objects:
    - watermark: the updated (npu, dma) indices below which no search is needed anymore
    - outstanding: per kind, the number of operations counted before the first conflicting operation
      was found, or -1 if no conflict was found
    """
    cur_op = npu_op_list[op_index]
    cur_access = memory_accesses[cur_op]
    cur_is_dma = isinstance(cur_op, NpuDmaOperation)

    # NPU dependency tracking
    npu_outstanding = -1
    npu_seen = 0
    npu_limit = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_seen = 0
    dma_limit = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    pos = op_index - 1
    while pos >= npu_limit or pos >= dma_limit:
        earlier_op = npu_op_list[pos]
        earlier_access = memory_accesses[earlier_op]

        if isinstance(earlier_op, NpuDmaOperation):
            # Check NPU consuming DMA output
            if pos >= dma_limit:
                if not cur_is_dma and dma_outstanding == -1 and earlier_access.conflicts(cur_access):
                    dma_outstanding = dma_seen
                dma_seen += 1  # Count DMA ops in the pipeline
                if dma_seen >= arch.max_outstanding_dma:
                    dma_limit = max(pos + 1, dma_limit)
        elif pos >= npu_limit:
            # Check DMA consuming NPU output
            if cur_is_dma and npu_outstanding == -1 and earlier_access.conflicts(cur_access):
                npu_outstanding = npu_seen
            npu_seen += 1  # Count NPU ops in the pipeline
            if npu_seen >= arch.max_outstanding_kernels:
                npu_limit = max(pos + 1, npu_limit)

        pos -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if dma_seen == 0 and npu_seen >= arch.max_outstanding_kernels:
        dma_limit = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    new_watermark = Watermark(npu_limit, dma_limit)
    outstanding = Watermark(npu_outstanding, dma_outstanding)
    return new_watermark, outstanding
341
342
# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------
346
347
def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    """
    Returns the block depth to use in the blockdep calculation.

    Note: NOT equivalent to the normal ifm block depth calculation since it takes into account
    'depthless' block operations by returning full depth: only Conv2D gets a calculated IFM block
    depth, every other operation type gets the full OFM depth.
    """
    if npu_op.op_type != NpuOperationType.Conv2D:
        return npu_op.ofm.shape.depth
    ifm = npu_op.ifm
    return arch.calc_ifm_block_depth(ifm.shape.depth, ifm.data_type.size_in_bits())
356
357
def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
    """
    Checks if the two areas overlap.

    The areas overlap only if the overlap is strictly positive in x, y and z; areas that merely
    touch (overlap of 0 in some dimension) do not count as overlapping.
    """
    overlap_x = min(end_a.x, end_b.x) - max(start_a.x, start_b.x)
    overlap_y = min(end_a.y, end_b.y) - max(start_a.y, start_b.y)
    overlap_z = min(end_a.z, end_b.z) - max(start_a.z, start_b.z)
    return overlap_x > 0 and overlap_y > 0 and overlap_z > 0
367
368
def intersects(
    ifm: NpuFeatureMap,
    ifm_start_coord: PointXYZ,
    ifm_end_coord: PointXYZ,
    prev_ofm: NpuFeatureMap,
    ofm_start_coord: PointXYZ,
    ofm_end_coord: PointXYZ,
) -> bool:
    """
    Checks if the given IFM area overlaps with the given OFM area.

    Coordinates are compared directly when both feature maps have the same shape and tiles;
    otherwise the address ranges of the two areas are compared.
    """
    same_geometry = ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles
    if same_geometry:
        # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
        # if the xyz coordinates overlap, which is quick and easy
        return coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)

    # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
    # In this case, address comparison between the two areas is needed
    ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
    prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
    return range_lists_overlap(ifm_ranges, prev_ofm_ranges)
389
390
# Block job dependency:
# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
#
#  A                    | B
# ----------------------+------------------
# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
#               |<------->| dependency offset
#
398#
399
400
def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
    """
    Get the coordinates of a block offset from either the end (negative)
    or the start (zero or positive) of the given 3D area

    Blocks are numbered with depth varying fastest, then width, then height. Returns None when the
    resulting block index lies beyond the last block of the area.
    """
    size = area.size()
    # Dimensions of the region, in blocks
    blocks_w = numeric_util.round_up_divide(size.width, block.width)
    blocks_h = numeric_util.round_up_divide(size.height, block.height)
    blocks_d = numeric_util.round_up_divide(size.depth, block.depth)
    num_blocks = blocks_w * blocks_h * blocks_d

    index = offset if offset >= 0 else num_blocks + offset
    # NOTE(review): a negative offset larger than the number of blocks gives a negative index, which is
    # not rejected here and results in a coordinate before the start of the area - confirm this is intended
    if index >= num_blocks:
        return None

    # Coordinates of the indexed block
    row_major_xy, z_index = divmod(index, blocks_d)
    y_index, x_index = divmod(row_major_xy, blocks_w)

    return PointXYZ(
        x=block.width * x_index + area.x,
        y=block.height * y_index + area.y,
        z=block.depth * z_index + area.z,
    )
426
427
def get_first_job_input_volume(
    arch: ArchitectureFeatures,
    ifm: Rect,
    ofm: Rect,
    ifm_block_depth,
    ofm_block: Block,
    kernel: Kernel,
    padding: NpuPadding,
    block_offset: int,
):
    """
    Gets the IFM volume that is sampled by the job at position block_offset, counted from the first job.

    Returns a tuple (start_coord, end_coord, total jobs), where end_coord is start_coord plus the size of
    the IFM block, or None if block_offset refers to an OFM block beyond the end of the OFM area.
    """
    # Get ifm block size (jobs are invisibly decomposed into subkernels)
    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
    ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)

    # Which OFM block are we calculating
    ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
    if ofm_coord is None:
        return None

    # Coordinate of the source IFM block: the OFM coordinate scaled by the stride, moved back by the
    # padding at the leading edge of the same axis (left for x, top for y) and clamped to 0
    ifm_coord_x = max(0, ofm_coord.x * kernel.stride.x - padding.left)
    ifm_coord_y = max(0, ofm_coord.y * kernel.stride.y - padding.top)
    ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth

    # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
    start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
    end_coord = PointXYZ(
        x=start_coord.x + ifm_block.width,
        y=start_coord.y + ifm_block.height,
        z=start_coord.z + ifm_block.depth,
    )
    return (start_coord, end_coord, 1)  # start, end, total jobs
460
461
def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
    """
    Gets the OFM volume written by the block that lies block_offset blocks before the last block of the OFM area.

    Returns a tuple (start_coord, end_coord, total jobs for this OFM block), where end_coord is
    start_coord plus the OFM block size, or None if no block coordinate could be determined.
    """
    assert block_offset >= 0

    # Get OFM block's volume coordinates; offset 0 is the last block, so count from the end
    block_start = get_offset_block_coords(ofm, ofm_block, -(block_offset + 1))
    if block_start is None:
        return None
    block_end = PointXYZ(
        x=block_start.x + ofm_block.width,
        y=block_start.y + ofm_block.height,
        z=block_start.z + ofm_block.depth,
    )
    return block_start, block_end, 1  # start, end, total jobs for this OFM block
475
476
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    npu_op: NpuBlockOperation,
) -> int:
    """
    Calculates the value of the BLOCKDEP register.

    Returns 0 when there is no previous operation, when the previous operation uses a LUT that the
    current one does not use while no SHRAM banks are reserved, when both IFM and IFM2 overlap with the
    previous OFM, or when the overlapping IFM2 is smaller than the IFM. Returns MAX_BLOCKDEP when neither
    IFM nor IFM2 overlaps with the previous OFM. Otherwise the result is derived from the number of jobs
    between the first input blocks of this operation and the last output blocks of the previous one.
    """
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    max_blockdep = ArchitectureFeatures.MAX_BLOCKDEP

    # Check if the reserved shram will be used in current/prev op
    prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    if prev_uses_lut and not curr_uses_lut and arch.shram_reserved_unused_banks == 0:
        return 0

    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, get_address_ranges(npu_op.ifm))
    ifm2_overlaps = False
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, get_address_ranges(npu_op.ifm2))

    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not (ifm_overlaps or ifm2_overlaps):
        # No overlap between prev OFM and IFM/IFM2
        return max_blockdep
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0

    # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    # Geometry of the current operation
    cur_cfg = npu_op.block_config
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(cur_cfg.width, cur_cfg.height, cur_cfg.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    padding = npu_op.padding if npu_op.padding is not None else NpuPadding(0, 0, 0, 0)
    kernel = to_kernel(npu_op.kernel)

    # Geometry of the previous operation
    prev_cfg = prev_op.block_config
    prev_ofm_block = Block(prev_cfg.width, prev_cfg.height, prev_cfg.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)

    # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
    # of IFM area overlaps with any previous OFM block generation.
    blockdep = max_blockdep
    elapsed_jobs = 0
    for forward_offset in range(max_blockdep):
        # This is the IFM block we want to sample from
        in_area = get_first_job_input_volume(
            arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
        )
        if in_area is None:
            break
        in_start, in_end, in_jobs = in_area

        # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
        outstanding_jobs = 0
        for block_offset in range(max_blockdep):
            # This is the OFM block being generated by the previous op
            out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
            if out_area is None:
                break
            out_start, out_end, out_jobs = out_area

            # Block dependency is the max number of allowed outstanding jobs
            # in the pipeline. Selected by determining how many jobs occur
            # in between two operators' overlapping OFM->IFM block volumes.
            # Also exit early if no intersections and we've seen enough jobs in the pipeline
            if (
                intersects(overlapping_fm, in_start, in_end, prev_op.ofm, out_start, out_end)
                or outstanding_jobs > max_blockdep
            ):
                break

            # This OFM had this many jobs (accumulate over multiple OFM blocks)
            outstanding_jobs += out_jobs

        blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
        elapsed_jobs += in_jobs
        # Early exit if no intersections and we've seen enough jobs in the pipeline
        if elapsed_jobs > max_blockdep:
            break

    return blockdep