blob: 74c4f90e07f17fd613af2481e63e245e620ee88b [file] [log] [blame]
Alexander Hanssonca9cc422023-06-22 16:01:27 +00001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Louis Verhaard1e170182020-11-26 11:42:04 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
24from .api import NpuActivationOp
25from .api import NpuAddressRange
26from .api import NpuBlockOperation
27from .api import NpuDmaOperation
28from .api import NpuElementWiseOp
29from .api import NpuFeatureMap
30from .api import NpuKernel
31from .api import NpuLayout
32from .api import NpuOperation
33from .api import NpuOperationType
34from .api import NpuPadding
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010035from .api import NpuQuantization
Louis Verhaard1e170182020-11-26 11:42:04 +010036from .api import NpuShape3D
37from .architecture_features import ArchitectureFeatures
38from .architecture_features import Block
39from .architecture_features import Rect
William Isakssona4f84112023-06-19 15:31:46 +000040from .errors import ByteAlignmentError
41from .errors import ByteSizeError
Louis Verhaard1e170182020-11-26 11:42:04 +010042from .operation import Kernel
43from .operation import PointXYZ
William Isakssona4f84112023-06-19 15:31:46 +000044from .tensor import TensorFormat
Louis Verhaard1e170182020-11-26 11:42:04 +010045from ethosu.vela.range_set import AccessDirection
46from ethosu.vela.range_set import MemoryAccessSet
47from ethosu.vela.range_set import MemoryRangeSet
48
William Isakssona4f84112023-06-19 15:31:46 +000049
Louis Verhaard1e170182020-11-26 11:42:04 +010050# base address slot for memory to memory transfer
51BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))
52
53
Michael McGeaghf3e3ad72020-12-02 12:39:03 +000054UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
Louis Verhaard1e170182020-11-26 11:42:04 +010055
56
William Isakssona4f84112023-06-19 15:31:46 +000057def check_alignment(payload, required_alignment):
58 # assuming payload is defined in bytes
59 if payload % required_alignment != 0:
60 raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned")
61
62
Björn Davidsson199e8e62023-10-10 11:22:59 +020063def check_size(payload, required_multiple, value_type):
William Isakssona4f84112023-06-19 15:31:46 +000064 # assuming payload is defined in bytes
65 if payload % required_multiple != 0:
Björn Davidsson199e8e62023-10-10 11:22:59 +020066 raise ByteSizeError(f"Cmd1 {value_type} of size: {payload} Bytes is not a multiple of {required_multiple}")
67
68
69def check_stride(stride, required_multiple):
70 check_size(stride, required_multiple, "stride")
71
72
73def check_length(length, required_multiple):
74 check_size(length, required_multiple, "length")
William Isakssona4f84112023-06-19 15:31:46 +000075
76
Louis Verhaard1e170182020-11-26 11:42:04 +010077def to_npu_kernel(kernel: Kernel) -> NpuKernel:
78 """Converts the given internally used kernel object to NpuKernel (of public API)"""
79 return NpuKernel(
80 kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
81 )
82
83
84def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
85 """Converts the given public API object to Kernel (used internally)"""
86 if kernel is None:
87 return Kernel(1, 1)
88 return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
89
90
91def has_ifm2(npu_op: NpuBlockOperation) -> bool:
92 """Checks if op has non-scalar IFM2"""
93 return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None
94
95
Louis Verhaard1e170182020-11-26 11:42:04 +010096def shape3d_size(shape: NpuShape3D) -> int:
97 return shape.width * shape.height * shape.depth
98
99
100def shape3d_to_rect(shape: NpuShape3D) -> Rect:
101 return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)
102
103
Tim Halld8339a72021-05-27 18:49:40 +0100104def shape3d_to_block(shape: NpuShape3D) -> Block:
105 return Block(shape.width, shape.height, shape.depth)
106
107
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +0100108def get_zero_point(fm: NpuFeatureMap):
109 return int(fm.quantization.zero_point if fm.quantization else 0)
110
111
112def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
113 """Quantizes the given value"""
114 scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
115 zp = 0 if quant is None else quant.zero_point
116 return numeric_util.quantise_float32(value, scale, zp)
117
118
Louis Verhaard1e170182020-11-26 11:42:04 +0100119# -------------------------------------------------------------------
120# ADDRESSING/STRIDES (helper functions)
121# -------------------------------------------------------------------
122
123
124def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
125 """Checks if the ranges overlap"""
126 return range1.region == range2.region and numeric_util.overlaps(
127 range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
128 )
129
130
131def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
132 """Checks if there is any address overlap between list1 and list2"""
133 for range1 in list1:
134 if range1 is None:
135 continue
136 for range2 in list2:
137 if range2 is not None and ranges_overlap(range1, range2):
138 return True
139 return False
140
141
142def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
143 """Calculates STRIDE_C/Y/X"""
144 if fm.strides is not None:
145 return fm.strides
146 elem_size = fm.data_type.size_in_bytes()
147 if fm.layout == NpuLayout.NHWC:
148 stride_c = elem_size
149 stride_x = fm.shape.depth * stride_c
150 stride_y = fm.shape.width * stride_x
151 else:
152 stride_x = 16 * elem_size
153 stride_c = stride_x * fm.shape.width
154 stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
155 return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
156
157
158def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
159 """Returns address of given coordinate"""
160 t = 0
161 BRICK = 16
162 stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
163 stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
164 if x >= fm.tiles.width_0:
165 x -= fm.tiles.width_0
166 t = 1
167 if y >= fm.tiles.height_1:
168 y -= fm.tiles.height_1
169 t += 2
170 elif y >= fm.tiles.height_0:
171 y -= fm.tiles.height_0
172 t += 2
173 elem_size = fm.data_type.size_in_bytes()
174 return (
175 fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
176 )
177
178
179def get_address_range(
180 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
181) -> NpuAddressRange:
182 """
183 Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
184 The begin and end coordinates must be within the same tile.
185 """
186 addr0 = get_address(fm, strides, y0, x0, c0)
187 addr1 = get_address(fm, strides, y1, x1, c1)
188 return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())
189
190
191def get_h_ranges(
192 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
193) -> List[NpuAddressRange]:
194 """
195 Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
196 the begin and end coordinates must be within the same tile.
197 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
198 """
199 return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]
200
201
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100202def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]:
Louis Verhaard1e170182020-11-26 11:42:04 +0100203 """
204 Returns a list of adddress ranges that covers the area start - end (inclusive).
205 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
206
207 For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
208 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]
209
210 .....|.... .....|....
211 t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1
212 ..XXX|XX.. ..BBB|DD..
213 -----+---- --> -----+----
214 t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3
215 .....|.... .....|....
216 """
217 strides = get_strides(fm)
218 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
219 h, w, c = fm.shape
220 y0, x0, c0 = start.y, start.x, start.z
221 y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1)
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100222 ranges: List[Optional[NpuAddressRange]] = []
Louis Verhaard1e170182020-11-26 11:42:04 +0100223 if x0 < width_0 and y0 < height_0:
224 # Horizontal ranges for tile 0
225 ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1))
226 if x1 >= width_0 and y0 < height_1:
227 # Horizontal ranges for tile 1
228 ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1))
229 if x0 < width_0 and y1 >= height_0:
230 # Horizontal ranges for tile 2
231 ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1))
232 if x1 >= width_0 and y1 >= height_1:
233 # Horizontal ranges for tile 3
234 ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1))
235 return ranges
236
237
238def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
239 """Returns 4 adddress ranges, one for every tile, None if the tile is not in use"""
240 strides = get_strides(fm)
241 height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
242 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
Jonas Ohlssond8575072022-03-30 10:30:25 +0200243 t0 = get_address_range(
244 fm,
245 strides,
246 0,
247 0,
248 0,
249 min(height, height_0) - 1,
250 min(width, width_0) - 1,
251 depth - 1,
252 )
Louis Verhaard1e170182020-11-26 11:42:04 +0100253 if width > width_0:
254 t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
255 else:
256 t1 = None
257 if height > height_0:
258 t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
259 else:
260 t2 = None
261 if t1 is not None and t2 is not None:
262 t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
263 else:
264 t3 = None
265 return [t0, t1, t2, t3]
266
267
William Isakssona4f84112023-06-19 15:31:46 +0000268def check_strides(fm: NpuFeatureMap, strides: NpuShape3D):
269
270 element_size_in_bytes = fm.data_type.size_in_bytes()
271
272 if fm.layout == NpuLayout.NHCWB16:
273 strides_to_check = [strides.depth, strides.height]
Björn Davidsson199e8e62023-10-10 11:22:59 +0200274 required_multiple = 16
William Isakssona4f84112023-06-19 15:31:46 +0000275 else:
276 strides_to_check = [strides.height, strides.width]
277 required_multiple = element_size_in_bytes
278 for stride in strides_to_check:
Björn Davidsson199e8e62023-10-10 11:22:59 +0200279 check_stride(stride, required_multiple)
William Isakssona4f84112023-06-19 15:31:46 +0000280
281
282def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures):
283 if layout == NpuLayout.NHCWB16:
284 required_alignment = arch.storage_rounding_quantums[TensorFormat.NHCWB16][-1]
285 else:
286 required_alignment = element_size
287 for addr in addresses:
288 check_alignment(addr, required_alignment)
289
290
Louis Verhaard1e170182020-11-26 11:42:04 +0100291# -------------------------------------------------------------------
292# DMA_WAIT/KERNEL_WAIT
293# -------------------------------------------------------------------
294
295
296class Watermark(NamedTuple):
297 npu: int
298 dma: int
299
300
301def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
302 return MemoryRangeSet(range.region, range.address, range.address + range.length)
303
304
305def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
306 """Returns the address that are read and written by the given DMA operation"""
307 res = MemoryAccessSet()
308 res.add(memory_range_set(dma_op.src), AccessDirection.Read)
309 res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
310 return res
311
312
313def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
314 """Returns the addresses that are read and written by the given operation"""
315 assert npu_op.ifm is not None and npu_op.ofm is not None
316 # Read addresses
317 read_ranges = get_address_ranges(npu_op.ifm)
318 if has_ifm2(npu_op):
319 assert npu_op.ifm2 is not None
320 read_ranges.extend(get_address_ranges(npu_op.ifm2))
321 read_ranges.extend(npu_op.weights)
322 read_ranges.extend(npu_op.biases)
323 if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
324 address = arch.available_shram_banks(True) * arch.shram_bank_size
325 read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048))
326 # Written addresses
327 write_ranges = get_address_ranges(npu_op.ofm)
328 # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
329 uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
330 written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
331 write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size))
332
333 res = MemoryAccessSet()
334 for read_range in read_ranges:
335 if read_range is not None:
336 res.add(memory_range_set(read_range), AccessDirection.Read)
337 for write_range in write_ranges:
338 if write_range is not None:
339 res.add(memory_range_set(write_range), AccessDirection.Write)
340 return res
341
342
343def get_wait_dependency(
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000344 arch: ArchitectureFeatures,
345 npu_op: NpuOperation,
346 memory_accesses,
347 outstanding_dma_ops: List[NpuOperation],
348 outstanding_npu_ops: List[NpuOperation],
Louis Verhaard1e170182020-11-26 11:42:04 +0100349):
350 """Used to calculate whether DMA wait or kernel wait operations are needed"""
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000351 kern_wait = -1
352 dma_wait = -1
353 op_accesses = memory_accesses[npu_op]
Louis Verhaard1e170182020-11-26 11:42:04 +0100354
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000355 if isinstance(npu_op, NpuDmaOperation):
356 outstanding_ops = outstanding_npu_ops
357 outstanding_dma_ops.append(npu_op)
358 if len(outstanding_dma_ops) > arch.max_outstanding_dma:
359 outstanding_dma_ops.pop(0)
360 else:
361 outstanding_ops = outstanding_dma_ops
362 outstanding_npu_ops.append(npu_op)
363 if len(outstanding_npu_ops) > arch.max_outstanding_kernels:
364 outstanding_npu_ops.pop(0)
Louis Verhaard1e170182020-11-26 11:42:04 +0100365
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000366 waits = -1
367 for idx in range(len(outstanding_ops) - 1, -1, -1):
368 waits += 1
369 other_op = outstanding_ops[idx]
370 other_accesses = memory_accesses[other_op]
371 if other_accesses.conflicts(op_accesses):
372 if isinstance(npu_op, NpuDmaOperation):
373 kern_wait = waits
374 else:
375 dma_wait = waits
376 # Current op needs to wait, and after it has waited,
377 # outstanding_ops[0..idx] are not outstanding any longer
378 for i in range(idx + 1):
379 outstanding_ops.pop(0)
380 break
Louis Verhaard1e170182020-11-26 11:42:04 +0100381
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000382 cmd_waits = Watermark(kern_wait, dma_wait)
383 return cmd_waits
Louis Verhaard1e170182020-11-26 11:42:04 +0100384
385
William Isakssona4f84112023-06-19 15:31:46 +0000386def check_dma_op(dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
387
388 # For Ethos-U65 only internal addresses have to be aligned, and if the internal address is the destination
389 # then the length has to be aligned also.
390 if arch.is_ethos_u65_system:
391 if dma_op.src.region == BASE_PTR_INDEX_MEM2MEM:
392 check_alignment(dma_op.src.address, 16)
393 if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM:
394 check_alignment(dma_op.dest.address, 16)
Björn Davidsson199e8e62023-10-10 11:22:59 +0200395 check_length(dma_op.src.length, 16)
William Isakssona4f84112023-06-19 15:31:46 +0000396 else:
397 check_alignment(dma_op.src.address, 16)
398 check_alignment(dma_op.dest.address, 16)
Björn Davidsson199e8e62023-10-10 11:22:59 +0200399 check_length(dma_op.src.length, 16)
William Isakssona4f84112023-06-19 15:31:46 +0000400
401
Louis Verhaard1e170182020-11-26 11:42:04 +0100402# -------------------------------------------------------------------
403# BLOCKDEP
404# -------------------------------------------------------------------
405
406
407def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
408 # Note: NOT equivalent to the normal ifm block depth calculation since
409 # it takes into account 'depthless' block operations by returning full
410 # depth
411 if npu_op.op_type == NpuOperationType.Conv2D:
412 res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
413 return res
414 return npu_op.ofm.shape.depth
415
416
417def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
418 """Checks if the two areas overlap"""
419 start_x = max(start_a.x, start_b.x)
420 end_x = min(end_a.x, end_b.x)
421 start_y = max(start_a.y, start_b.y)
422 end_y = min(end_a.y, end_b.y)
423 start_z = max(start_a.z, start_b.z)
424 end_z = min(end_a.z, end_b.z)
425 return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0)
426
427
428def intersects(
429 ifm: NpuFeatureMap,
430 ifm_start_coord: PointXYZ,
431 ifm_end_coord: PointXYZ,
432 prev_ofm: NpuFeatureMap,
433 ofm_start_coord: PointXYZ,
434 ofm_end_coord: PointXYZ,
435) -> bool:
436 """Checks if the given IFM area overlaps with the given OFM area"""
437 if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles:
438 # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
439 # if the xyz coordinates overlap, which is quick and easy
440 res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
441 else:
442 # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
443 # In this case, address comparison between the two areas is needed
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100444 ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
Louis Verhaard1e170182020-11-26 11:42:04 +0100445 prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
446 res = range_lists_overlap(ifm_ranges, prev_ofm_ranges)
447 return res
448
449
450# Block job dependency:
451# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
452#
453# A | B
454# ----------------------+------------------
455# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
456# |<------->| dependency offset
457#
458
459
460def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
461 """
462 Get the coordinates of a block offset from either the end (negative)
463 or the start (zero or positive) of the given 3D area
464 """
465 size = area.size()
466 # Dimensions of the region, in blocks
467 width_blocks = numeric_util.round_up_divide(size.width, block.width)
468 height_blocks = numeric_util.round_up_divide(size.height, block.height)
469 depth_blocks = numeric_util.round_up_divide(size.depth, block.depth)
470 total_blocks = width_blocks * height_blocks * depth_blocks
471 if offset < 0:
472 index = total_blocks + offset
473 else:
474 index = offset
475
476 if index >= total_blocks:
477 return None
478
479 # Coordinates of the indexed block
480 coord_z = block.depth * (index % depth_blocks)
481 coord_y = block.height * (index // (depth_blocks * width_blocks))
482 coord_x = block.width * ((index // depth_blocks) % width_blocks)
483
484 return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z)
485
486
487def get_first_job_input_volume(
488 arch: ArchitectureFeatures,
489 ifm: Rect,
490 ofm: Rect,
491 ifm_block_depth,
492 ofm_block: Block,
493 kernel: Kernel,
494 padding: NpuPadding,
495 block_offset: int,
496):
497 # Get ifm block size (jobs are invisibly decomposed into subkernels)
498 ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
499 ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)
500
501 # Which OFM block are we calculating
502 ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
503 if ofm_coord is None:
504 return None
505
506 # Coordinate of the source IFM block
507 ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left)
508 ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right)
509 ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth
510
511 # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
512 start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
513 end_coord = PointXYZ(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200514 x=start_coord[0] + ifm_block.width,
515 y=start_coord[1] + ifm_block.height,
516 z=start_coord[2] + ifm_block.depth,
Louis Verhaard1e170182020-11-26 11:42:04 +0100517 )
518 return (start_coord, end_coord, 1) # start, end, total jobs
519
520
521def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
522 assert block_offset >= 0
523
524 # Get OFM block's volume coordinates
525 start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset)
526 if start_coord is None:
527 return None
528 end_coord = PointXYZ(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200529 x=start_coord.x + ofm_block.width,
530 y=start_coord.y + ofm_block.height,
531 z=start_coord.z + ofm_block.depth,
Louis Verhaard1e170182020-11-26 11:42:04 +0100532 )
533 return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block
534
535
Jonas Ohlssond8575072022-03-30 10:30:25 +0200536def calc_blockdep(
537 arch: ArchitectureFeatures,
538 prev_op: Optional[NpuBlockOperation],
539 npu_op: NpuBlockOperation,
540) -> int:
Louis Verhaard1e170182020-11-26 11:42:04 +0100541 """Calculates the value of the BLOCKDEP register"""
542 if prev_op is None:
543 return 0
544 assert npu_op.ifm is not None
545 assert prev_op.ofm is not None
Diqing Zhong455e20e2021-02-03 16:37:31 +0100546 # Check if the reserved shram will be used in current/prev op
547 prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
548 curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
549 if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut:
550 return 0
551
Louis Verhaard1e170182020-11-26 11:42:04 +0100552 # Check if IFM or IFM2 overlaps with prev op's OFM
553 prev_ofm_ranges = get_address_ranges(prev_op.ofm)
554 ifm_ranges = get_address_ranges(npu_op.ifm)
555 ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
556 if has_ifm2(npu_op):
557 assert npu_op.ifm2 is not None
558 ifm2_ranges = get_address_ranges(npu_op.ifm2)
559 ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
560 else:
561 ifm2_overlaps = False
562 if ifm_overlaps and ifm2_overlaps:
563 # Both IFM and IFM2 overlap (should be rare)
564 return 0
565 if not ifm_overlaps and not ifm2_overlaps:
566 # No overlap between prev OFM and IFM/IFM2
567 return ArchitectureFeatures.MAX_BLOCKDEP
568 if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
569 # Prev OFM produces IFM2 which is broadcasted (this should be rare)
570 return 0
571 # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
572 prev_block_config = prev_op.block_config
573 block_config = npu_op.block_config
574 overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
575 assert overlapping_fm is not None
576
577 cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
578 cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
579 cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
580 cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
581 padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
582 blockdep = ArchitectureFeatures.MAX_BLOCKDEP
583 kernel = to_kernel(npu_op.kernel)
584
585 prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
586 prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
587 # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
588 # of IFM area overlaps with any previous OFM block generation.
589 elapsed_jobs = 0
590 for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
591 # This is the IFM block we want to sample from
592 in_area = get_first_job_input_volume(
593 arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
594 )
595 if in_area is None:
596 break
597
598 # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
599 outstanding_jobs = 0
600 for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
601 # This is the OFM block being generated by the previous op
602 out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
603 if out_area is None:
604 break
605
606 # Block dependency is the max number of allowed outstanding jobs
607 # in the pipeline. Selected by determining how many jobs occur
608 # in between two operators' overlapping OFM->IFM block volumes
609 if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
610 break
611 # Early exit if no intersections and we've seen enough jobs in the pipeline
612 elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
613 break
614
615 # This OFM had this many jobs (accumulate over multiple OFM blocks)
616 outstanding_jobs += out_area[2]
617
618 blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
619 elapsed_jobs += in_area[2]
620 # Early exit if no intersections and we've seen enough jobs in the pipeline
621 if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
622 break
623
624 return blockdep