blob: 6f57f54a2e4c1e4b907e990505df51b25ddbb1bf [file] [log] [blame]
# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010035from .api import NpuQuantization
Louis Verhaard1e170182020-11-26 11:42:04 +010036from .api import NpuShape3D
37from .architecture_features import ArchitectureFeatures
38from .architecture_features import Block
39from .architecture_features import Rect
40from .operation import Kernel
41from .operation import PointXYZ
42from ethosu.vela.range_set import AccessDirection
43from ethosu.vela.range_set import MemoryAccessSet
44from ethosu.vela.range_set import MemoryRangeSet
45
# Base address slot for memory to memory transfer.
# In this file it is used as the region of the address ranges that describe SHRAM accesses.
BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))


# Elementwise operation types that are grouped as unary operations
UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
Louis Verhaard1e170182020-11-26 11:42:04 +010051
52
def to_npu_kernel(kernel: Kernel) -> NpuKernel:
    """Builds the NpuKernel (public API) that corresponds to the given internally used kernel object"""
    stride = kernel.stride
    dilation = kernel.dilation
    return NpuKernel(kernel.width, kernel.height, stride.x, stride.y, dilation.x, dilation.y)
58
59
def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
    """Builds the internally used Kernel from the given public API kernel; None results in a 1x1 kernel"""
    if kernel is not None:
        return Kernel(
            kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y
        )
    return Kernel(1, 1)
65
66
def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Returns True if the op has an IFM2 and no IFM2 scalar value"""
    if npu_op.ifm2 is None:
        return False
    return npu_op.ifm2_scalar is None
70
71
def shape3d_size(shape: NpuShape3D) -> int:
    """Returns the product of the width, height and depth of the shape"""
    area = shape.width * shape.height
    return area * shape.depth
74
75
def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    """Returns a Rect that starts at (0, 0, 0) and ends at the shape's dimensions minus one"""
    last_x = shape.width - 1
    last_y = shape.height - 1
    last_z = shape.depth - 1
    return Rect(0, 0, 0, last_x, last_y, last_z)
78
79
def shape3d_to_block(shape: NpuShape3D) -> Block:
    """Returns a Block with the same width, height and depth as the given shape"""
    width, height, depth = shape.width, shape.height, shape.depth
    return Block(width, height, depth)
82
83
def get_zero_point(fm: NpuFeatureMap):
    """Returns the zero point of the feature map's quantization as int; 0 if the quantization is not set"""
    if fm.quantization:
        return int(fm.quantization.zero_point)
    return int(0)
86
87
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value; a missing scale is treated as 1, a missing quantization as scale 1/zero point 0"""
    if quant is None:
        scale, zero_point = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zero_point = quant.zero_point
    return numeric_util.quantise_float32(value, scale, zero_point)
93
94
# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------
98
99
def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges are in the same region and their address intervals overlap"""
    if range1.region != range2.region:
        # Ranges in different regions never overlap
        return False
    end1 = range1.address + range1.length
    end2 = range2.address + range2.length
    return numeric_util.overlaps(range1.address, end1, range2.address, end2)
105
106
def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """Checks if any range of list1 overlaps with any range of list2; None entries are ignored"""
    return any(
        ranges_overlap(first, second)
        for first in list1
        if first is not None
        for second in list2
        if second is not None
    )
116
117
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """
    Calculates STRIDE_C/Y/X, returned as depth/height/width of the result.
    Strides that are set on the feature map are returned unchanged.
    """
    explicit_strides = fm.strides
    if explicit_strides is not None:
        return explicit_strides
    bytes_per_elem = fm.data_type.size_in_bytes()
    shape = fm.shape
    if fm.layout != NpuLayout.NHWC:
        # Every other layout is handled in groups of 16 channels
        stride_x = 16 * bytes_per_elem
        stride_c = stride_x * shape.width
        stride_y = bytes_per_elem * shape.width * numeric_util.round_up(shape.depth, 16)
    else:
        stride_c = bytes_per_elem
        stride_x = shape.depth * stride_c
        stride_y = shape.width * stride_x
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
132
133
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """
    Returns the address of the element at the given coordinate.
    The tile is selected from x and y, which are then made relative to that tile.
    """
    brick = 16
    elem_size = fm.data_type.size_in_bytes()
    brick_bytes = brick * elem_size
    stride_c = brick_bytes if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = brick_bytes if fm.layout == NpuLayout.NHCWB16 else strides.width
    tiles = fm.tiles
    if x >= tiles.width_0:
        # Right-hand tiles (1 and 3) are split vertically at height_1
        x -= tiles.width_0
        tile, split_row = 1, tiles.height_1
    else:
        # Left-hand tiles (0 and 2) are split vertically at height_0
        tile, split_row = 0, tiles.height_0
    if y >= split_row:
        y -= split_row
        tile += 2
    offset = y * strides.height + x * stride_x + (c // brick) * stride_c + int(c % brick) * elem_size
    return tiles.addresses[tile] + offset
153
154
def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
    The begin and end coordinates must be within the same tile.
    """
    first = get_address(fm, strides, y0, x0, c0)
    last = get_address(fm, strides, y1, x1, c1)
    # The end coordinate is inclusive, so the element at the end address is part of the range
    num_bytes = last - first + fm.data_type.size_in_bytes()
    return NpuAddressRange(region=fm.region, address=first, length=num_bytes)
165
166
def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
    the begin and end coordinates must be within the same tile.
    The area is divided in horizontal "stripes" of height 1; one address range is returned for every "stripe".
    """
    stripes = []
    for row in range(y0, y1 + 1):
        stripes.append(get_address_range(fm, strides, row, x0, c0, row, x1, c1))
    return stripes
176
177
def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]:
    """
    Returns a list of address ranges that covers the area start - end (inclusive).
    The end coordinate is clamped to the last element of the feature map.
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

        .....|....           .....|....
     t0 ..XXX|XX.. t1     t0 ..AAA|CC.. t1
        ..XXX|XX..           ..BBB|DD..
        -----+----    -->    -----+----
     t2 ..XXX|XX.. t3     t2 ..EEE|FF.. t3
        .....|....           .....|....
    """
    strides = get_strides(fm)
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0
    fm_height, fm_width, fm_depth = fm.shape
    y0, x0, c0 = start.y, start.x, start.z
    y1 = min(end.y, fm_height - 1)
    x1 = min(end.x, fm_width - 1)
    c1 = min(end.z, fm_depth - 1)

    # Which tile columns are touched by the area, and the x extent of the area inside each of them
    touches_left = x0 < width_0
    touches_right = x1 >= width_0
    left_x1 = min(x1, width_0 - 1)
    right_x0 = max(x0, width_0)

    ranges: List[Optional[NpuAddressRange]] = []
    if touches_left and y0 < height_0:
        # Horizontal ranges for tile 0
        ranges += get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), left_x1, c1)
    if touches_right and y0 < height_1:
        # Horizontal ranges for tile 1
        ranges += get_h_ranges(fm, strides, y0, right_x0, c0, min(y1, height_1 - 1), x1, c1)
    if touches_left and y1 >= height_0:
        # Horizontal ranges for tile 2
        ranges += get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, left_x1, c1)
    if touches_right and y1 >= height_1:
        # Horizontal ranges for tile 3
        ranges += get_h_ranges(fm, strides, max(y0, height_1), right_x0, c0, y1, x1, c1)
    return ranges
212
213
def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    shape = fm.shape
    height, width, depth = shape.height, shape.width, shape.depth
    tiles = fm.tiles
    height_0, height_1, width_0 = tiles.height_0, tiles.height_1, tiles.width_0

    last_c = depth - 1
    left_last_x = min(width, width_0) - 1
    # Tile 1 is in use if the fm is wider than tile 0, tile 2 if it is higher than tile 0
    uses_right = width > width_0
    uses_bottom = height > height_0

    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, left_last_x, last_c)
    t1 = None
    if uses_right:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, last_c)
    t2 = None
    if uses_bottom:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, left_last_x, last_c)
    t3 = None
    if uses_right and uses_bottom:
        # Tile 3 is only in use when both tile 1 and tile 2 are in use
        t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, last_c)
    return [t0, t1, t2, t3]
242
243
# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------
247
248
class Watermark(NamedTuple):
    """
    Pair of values, one for NPU operations and one for DMA operations.
    get_wait_dependency uses it both for the search limits (indices in the operation list)
    and for the number of outstanding operations (-1 if there is no dependency).
    """

    # Value that applies to NPU (non-DMA) operations
    npu: int
    # Value that applies to DMA operations
    dma: int
252
253
def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    """Converts the address range to a MemoryRangeSet of (region, start address, end address)"""
    start = range.address
    end = start + range.length
    return MemoryRangeSet(range.region, start, end)
256
257
def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read (source) and written (destination) by the given DMA operation"""
    accesses = MemoryAccessSet()
    transfers = (
        (dma_op.src, AccessDirection.Read),
        (dma_op.dest, AccessDirection.Write),
    )
    for addr_range, direction in transfers:
        accesses.add(memory_range_set(addr_range), direction)
    return accesses
264
265
def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """
    Returns the addresses that are read and written by the given operation.
    Reads: IFM, non-scalar IFM2, weights, biases and, for table lookup activation, the LUT area in SHRAM.
    Writes: OFM and the SHRAM banks that the operation can overwrite.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    # Read addresses
    reads = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        reads += get_address_ranges(npu_op.ifm2)
    reads += npu_op.weights
    reads += npu_op.biases
    if uses_lut:
        lut_address = arch.available_shram_banks(True) * arch.shram_bank_size
        reads.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=lut_address, length=2048))

    # Written addresses
    writes = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    writes.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size))

    accesses = MemoryAccessSet()
    for direction, addr_ranges in ((AccessDirection.Read, reads), (AccessDirection.Write, writes)):
        for addr_range in addr_ranges:
            # Tiles that are not in use are None
            if addr_range is not None:
                accesses.add(memory_range_set(addr_range), direction)
    return accesses
294
295
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """
    Used to calculate whether DMA wait or kernel wait operations are needed.

    Searches backwards from the operation before npu_op_list[op_index] for operations of the other kind
    (DMA for an NPU operation, NPU for a DMA operation) whose memory accesses conflict with the operation.

    memory_accesses is indexed by operation and must give an object with a conflicts() method.
    watermark holds the indices (npu, dma) in npu_op_list that the search does not go below.

    Returns a tuple (watermark, outstanding):
    - watermark: the updated search limits
    - outstanding: for npu and for dma, the number of operations of that kind that were passed in the search
      before the first conflicting operation was found, or -1 if no conflict was found
    """
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if isinstance(prev_op, NpuDmaOperation):
            if index >= dma_index:
                if not isinstance(npu_op, NpuDmaOperation):
                    # Only the first (most recent) conflicting DMA operation is recorded
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                # Only the first (most recent) conflicting NPU operation is recorded
                if isinstance(npu_op, NpuDmaOperation) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
353
354
355# -------------------------------------------------------------------
356# BLOCKDEP
357# -------------------------------------------------------------------
358
359
def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    """Returns the IFM block depth for Conv2D operations, and the full OFM depth for all other operations"""
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type != NpuOperationType.Conv2D:
        return npu_op.ofm.shape.depth
    ifm = npu_op.ifm
    return arch.calc_ifm_block_depth(ifm.shape.depth, ifm.data_type.size_in_bits())
368
369
def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
    """Checks if the two areas overlap; the overlap must be larger than zero in x, y and z"""
    overlap_x = min(end_a.x, end_b.x) - max(start_a.x, start_b.x)
    overlap_y = min(end_a.y, end_b.y) - max(start_a.y, start_b.y)
    overlap_z = min(end_a.z, end_b.z) - max(start_a.z, start_b.z)
    return overlap_x > 0 and overlap_y > 0 and overlap_z > 0
379
380
def intersects(
    ifm: NpuFeatureMap,
    ifm_start_coord: PointXYZ,
    ifm_end_coord: PointXYZ,
    prev_ofm: NpuFeatureMap,
    ofm_start_coord: PointXYZ,
    ofm_end_coord: PointXYZ,
) -> bool:
    """Checks if the given IFM area overlaps with the given OFM area"""
    same_layout = ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles
    if same_layout:
        # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
        # if the xyz coordinates overlap, which is quick and easy
        return coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
    # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
    # In this case, address comparison between the two areas is needed
    ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
    prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
    return range_lists_overlap(ifm_ranges, prev_ofm_ranges)
401
402
# Block job dependency:
# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
#
#  A                    | B
# ----------------------+------------------
# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
#               |<------->| dependency offset
#
411
412
def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
    """
    Get the coordinates of a block offset from either the end (negative)
    or the start (zero or positive) of the given 3D area.
    Blocks are numbered depth first, then along the width, then along the height.
    Returns None if the resulting block index is beyond the last block.
    NOTE(review): a negative offset whose magnitude exceeds the number of blocks is not range checked.
    """
    size = area.size()
    # Dimensions of the region, in blocks
    width_blocks = numeric_util.round_up_divide(size.width, block.width)
    height_blocks = numeric_util.round_up_divide(size.height, block.height)
    depth_blocks = numeric_util.round_up_divide(size.depth, block.depth)
    total_blocks = width_blocks * height_blocks * depth_blocks

    index = total_blocks + offset if offset < 0 else offset
    if index >= total_blocks:
        return None

    # Split the linear block index in its z, x and y block numbers
    xy_index, z_block = divmod(index, depth_blocks)
    y_block, x_block = divmod(xy_index, width_blocks)

    return PointXYZ(
        x=block.width * x_block + area.x,
        y=block.height * y_block + area.y,
        z=block.depth * z_block + area.z,
    )
438
439
def get_first_job_input_volume(
    arch: ArchitectureFeatures,
    ifm: Rect,
    ofm: Rect,
    ifm_block_depth,
    ofm_block: Block,
    kernel: Kernel,
    padding: NpuPadding,
    block_offset: int,
):
    """
    Gets the IFM volume that is sampled by job number block_offset, counted from the first job of an operation.

    Returns a tuple (start coordinate, end coordinate, number of jobs), where the end coordinate is the
    start coordinate plus the IFM block size. Returns None if block_offset refers to an OFM block
    beyond the last OFM block.
    """
    # Get ifm block size (jobs are invisibly decomposed into subkernels)
    ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
    ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)

    # Which OFM block are we calculating
    ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
    if ofm_coord is None:
        return None

    # Coordinate of the source IFM block: the OFM coordinate scaled by the stride, moved back by the
    # leading padding of that axis (left for x, top for y) and clamped to the start of the IFM
    ifm_coord_x = max(0, ofm_coord.x * kernel.stride.x - padding.left)
    ifm_coord_y = max(0, ofm_coord.y * kernel.stride.y - padding.top)
    ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth

    # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
    start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
    end_coord = PointXYZ(
        x=start_coord.x + ifm_block.width,
        y=start_coord.y + ifm_block.height,
        z=start_coord.z + ifm_block.depth,
    )
    return (start_coord, end_coord, 1)  # start, end, total jobs
472
473
def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
    """
    Gets the volume of the OFM block that lies block_offset blocks before the last block of the OFM.
    Returns a tuple (start coordinate, end coordinate, number of jobs), or None if there is no such block.
    """
    assert block_offset >= 0

    # Negative offsets count from the end of the OFM; block_offset 0 is the last block
    first = get_offset_block_coords(ofm, ofm_block, -(block_offset + 1))
    if first is None:
        return None
    last = PointXYZ(x=first.x + ofm_block.width, y=first.y + ofm_block.height, z=first.z + ofm_block.depth)
    return (first, last, 1)  # start, end, total jobs for this OFM block
487
488
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    npu_op: NpuBlockOperation,
) -> int:
    """
    Calculates the value of the BLOCKDEP register

    Returns 0 if:
    - there is no previous operation
    - the previous operation uses a LUT, the current one does not, and the architecture has no reserved unused
      SHRAM banks
    - both IFM and IFM2 overlap with the OFM of the previous operation
    - IFM2 overlaps with the OFM of the previous operation and is smaller than IFM
    Returns ArchitectureFeatures.MAX_BLOCKDEP if neither IFM nor IFM2 overlaps with the OFM of the previous
    operation. Otherwise the value is calculated from the number of jobs between the last OFM blocks of the
    previous operation and the first IFM blocks of the current operation that overlap.
    """
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    # Check if the reserved shram will be used in current/prev op
    prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut:
        return 0

    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_ranges = get_address_ranges(npu_op.ifm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_ranges = get_address_ranges(npu_op.ifm2)
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
    else:
        ifm2_overlaps = False
    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not ifm_overlaps and not ifm2_overlaps:
        # No overlap between prev OFM and IFM/IFM2
        return ArchitectureFeatures.MAX_BLOCKDEP
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0
    # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    # Exactly one of IFM/IFM2 overlaps at this point
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
    blockdep = ArchitectureFeatures.MAX_BLOCKDEP
    kernel = to_kernel(npu_op.kernel)

    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
    # of IFM area overlaps with any previous OFM block generation.
    elapsed_jobs = 0
    for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
        # This is the IFM block we want to sample from
        in_area = get_first_job_input_volume(
            arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
        )
        if in_area is None:
            break

        # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
        outstanding_jobs = 0
        for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
            # This is the OFM block being generated by the previous op
            out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
            if out_area is None:
                break

            # Block dependency is the max number of allowed outstanding jobs
            # in the pipeline. Selected by determining how many jobs occur
            # in between two operators' overlapping OFM->IFM block volumes
            if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
                break
            # Early exit if no intersections and we've seen enough jobs in the pipeline
            elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
                break

            # This OFM had this many jobs (accumulate over multiple OFM blocks)
            outstanding_jobs += out_area[2]

        # Jobs of the current op that precede this one, plus previous op jobs that were passed without overlap
        blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
        elapsed_jobs += in_area[2]
        # Early exit if no intersections and we've seen enough jobs in the pipeline
        if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
            break

    return blockdep