blob: 8a6f94e4d6e7daa8127be29b4b61c20d8ddcfb10 [file] [log] [blame]
William Isaksson56e5f0c2024-01-10 12:28:04 +01001# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
Louis Verhaard1e170182020-11-26 11:42:04 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description:
18# Utility functions for code generation
19from typing import List
20from typing import NamedTuple
21from typing import Optional
22
23from . import numeric_util
William Isaksson56e5f0c2024-01-10 12:28:04 +010024from .api import NpuAccumulatorType
Louis Verhaard1e170182020-11-26 11:42:04 +010025from .api import NpuActivationOp
26from .api import NpuAddressRange
27from .api import NpuBlockOperation
28from .api import NpuDmaOperation
29from .api import NpuElementWiseOp
30from .api import NpuFeatureMap
31from .api import NpuKernel
32from .api import NpuLayout
33from .api import NpuOperation
34from .api import NpuOperationType
35from .api import NpuPadding
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010036from .api import NpuQuantization
Louis Verhaard1e170182020-11-26 11:42:04 +010037from .api import NpuShape3D
38from .architecture_features import ArchitectureFeatures
39from .architecture_features import Block
40from .architecture_features import Rect
William Isakssona4f84112023-06-19 15:31:46 +000041from .errors import ByteAlignmentError
42from .errors import ByteSizeError
Louis Verhaard1e170182020-11-26 11:42:04 +010043from .operation import Kernel
44from .operation import PointXYZ
William Isakssona4f84112023-06-19 15:31:46 +000045from .tensor import TensorFormat
William Isaksson56e5f0c2024-01-10 12:28:04 +010046from .tflite.TensorType import TensorType
Louis Verhaard1e170182020-11-26 11:42:04 +010047from ethosu.vela.range_set import AccessDirection
48from ethosu.vela.range_set import MemoryAccessSet
49from ethosu.vela.range_set import MemoryRangeSet
50
William Isakssona4f84112023-06-19 15:31:46 +000051
Louis Verhaard1e170182020-11-26 11:42:04 +010052# base address slot for memory to memory transfer
53BASE_PTR_INDEX_MEM2MEM = int((1 << 8) | (3 << 0))
54
55
Michael McGeaghf3e3ad72020-12-02 12:39:03 +000056UNARY_ELEMWISE_OPS = (NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ)
Louis Verhaard1e170182020-11-26 11:42:04 +010057
58
William Isakssona4f84112023-06-19 15:31:46 +000059def check_alignment(payload, required_alignment):
60 # assuming payload is defined in bytes
61 if payload % required_alignment != 0:
62 raise ByteAlignmentError(f"Cmd1 payload of size: {payload} Bytes is not {required_alignment}-byte aligned")
63
64
Björn Davidsson199e8e62023-10-10 11:22:59 +020065def check_size(payload, required_multiple, value_type):
William Isakssona4f84112023-06-19 15:31:46 +000066 # assuming payload is defined in bytes
67 if payload % required_multiple != 0:
Björn Davidsson199e8e62023-10-10 11:22:59 +020068 raise ByteSizeError(f"Cmd1 {value_type} of size: {payload} Bytes is not a multiple of {required_multiple}")
69
70
71def check_stride(stride, required_multiple):
72 check_size(stride, required_multiple, "stride")
73
74
75def check_length(length, required_multiple):
76 check_size(length, required_multiple, "length")
William Isakssona4f84112023-06-19 15:31:46 +000077
78
William Isaksson56e5f0c2024-01-10 12:28:04 +010079def to_npu_acc_type(accType: TensorType) -> NpuAccumulatorType:
80 if accType == TensorType.INT32:
81 return NpuAccumulatorType.Int32
82 elif accType == TensorType.INT64:
83 return NpuAccumulatorType.Int40
84 else:
85 return NpuAccumulatorType.Default
86
87
Louis Verhaard1e170182020-11-26 11:42:04 +010088def to_npu_kernel(kernel: Kernel) -> NpuKernel:
89 """Converts the given internally used kernel object to NpuKernel (of public API)"""
90 return NpuKernel(
91 kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
92 )
93
94
95def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
96 """Converts the given public API object to Kernel (used internally)"""
97 if kernel is None:
98 return Kernel(1, 1)
99 return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
100
101
102def has_ifm2(npu_op: NpuBlockOperation) -> bool:
103 """Checks if op has non-scalar IFM2"""
104 return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None
105
106
Louis Verhaard1e170182020-11-26 11:42:04 +0100107def shape3d_size(shape: NpuShape3D) -> int:
108 return shape.width * shape.height * shape.depth
109
110
111def shape3d_to_rect(shape: NpuShape3D) -> Rect:
112 return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)
113
114
Tim Halld8339a72021-05-27 18:49:40 +0100115def shape3d_to_block(shape: NpuShape3D) -> Block:
116 return Block(shape.width, shape.height, shape.depth)
117
118
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +0100119def get_zero_point(fm: NpuFeatureMap):
120 return int(fm.quantization.zero_point if fm.quantization else 0)
121
122
123def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
124 """Quantizes the given value"""
125 scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
126 zp = 0 if quant is None else quant.zero_point
127 return numeric_util.quantise_float32(value, scale, zp)
128
129
Louis Verhaard1e170182020-11-26 11:42:04 +0100130# -------------------------------------------------------------------
131# ADDRESSING/STRIDES (helper functions)
132# -------------------------------------------------------------------
133
134
135def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
136 """Checks if the ranges overlap"""
137 return range1.region == range2.region and numeric_util.overlaps(
138 range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
139 )
140
141
142def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
143 """Checks if there is any address overlap between list1 and list2"""
144 for range1 in list1:
145 if range1 is None:
146 continue
147 for range2 in list2:
148 if range2 is not None and ranges_overlap(range1, range2):
149 return True
150 return False
151
152
153def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
154 """Calculates STRIDE_C/Y/X"""
155 if fm.strides is not None:
156 return fm.strides
157 elem_size = fm.data_type.size_in_bytes()
158 if fm.layout == NpuLayout.NHWC:
159 stride_c = elem_size
160 stride_x = fm.shape.depth * stride_c
161 stride_y = fm.shape.width * stride_x
162 else:
163 stride_x = 16 * elem_size
164 stride_c = stride_x * fm.shape.width
165 stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
166 return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
167
168
169def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
170 """Returns address of given coordinate"""
171 t = 0
172 BRICK = 16
173 stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
174 stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
175 if x >= fm.tiles.width_0:
176 x -= fm.tiles.width_0
177 t = 1
178 if y >= fm.tiles.height_1:
179 y -= fm.tiles.height_1
180 t += 2
181 elif y >= fm.tiles.height_0:
182 y -= fm.tiles.height_0
183 t += 2
184 elem_size = fm.data_type.size_in_bytes()
185 return (
186 fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
187 )
188
189
190def get_address_range(
191 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
192) -> NpuAddressRange:
193 """
194 Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
195 The begin and end coordinates must be within the same tile.
196 """
197 addr0 = get_address(fm, strides, y0, x0, c0)
198 addr1 = get_address(fm, strides, y1, x1, c1)
199 return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())
200
201
202def get_h_ranges(
203 fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
204) -> List[NpuAddressRange]:
205 """
206 Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
207 the begin and end coordinates must be within the same tile.
208 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
209 """
210 return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]
211
212
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100213def get_address_ranges_for_area(fm: NpuFeatureMap, start: PointXYZ, end: PointXYZ) -> List[Optional[NpuAddressRange]]:
Louis Verhaard1e170182020-11-26 11:42:04 +0100214 """
215 Returns a list of adddress ranges that covers the area start - end (inclusive).
216 Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
217
218 For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
219 6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]
220
221 .....|.... .....|....
222 t0 ..XXX|XX.. t1 t0 ..AAA|CC.. t1
223 ..XXX|XX.. ..BBB|DD..
224 -----+---- --> -----+----
225 t2 ..XXX|XX.. t3 t2 ..EEE|FF.. t3
226 .....|.... .....|....
227 """
228 strides = get_strides(fm)
229 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
230 h, w, c = fm.shape
231 y0, x0, c0 = start.y, start.x, start.z
232 y1, x1, c1 = min(end.y, h - 1), min(end.x, w - 1), min(end.z, c - 1)
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100233 ranges: List[Optional[NpuAddressRange]] = []
Louis Verhaard1e170182020-11-26 11:42:04 +0100234 if x0 < width_0 and y0 < height_0:
235 # Horizontal ranges for tile 0
236 ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y1, height_0 - 1), min(x1, width_0 - 1), c1))
237 if x1 >= width_0 and y0 < height_1:
238 # Horizontal ranges for tile 1
239 ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y1, height_1 - 1), x1, c1))
240 if x0 < width_0 and y1 >= height_0:
241 # Horizontal ranges for tile 2
242 ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y1, min(x1, width_0 - 1), c1))
243 if x1 >= width_0 and y1 >= height_1:
244 # Horizontal ranges for tile 3
245 ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y1, x1, c1))
246 return ranges
247
248
249def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
250 """Returns 4 adddress ranges, one for every tile, None if the tile is not in use"""
251 strides = get_strides(fm)
252 height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
253 height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
Jonas Ohlssond8575072022-03-30 10:30:25 +0200254 t0 = get_address_range(
255 fm,
256 strides,
257 0,
258 0,
259 0,
260 min(height, height_0) - 1,
261 min(width, width_0) - 1,
262 depth - 1,
263 )
Louis Verhaard1e170182020-11-26 11:42:04 +0100264 if width > width_0:
265 t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
266 else:
267 t1 = None
268 if height > height_0:
269 t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
270 else:
271 t2 = None
272 if t1 is not None and t2 is not None:
273 t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
274 else:
275 t3 = None
276 return [t0, t1, t2, t3]
277
278
William Isakssona4f84112023-06-19 15:31:46 +0000279def check_strides(fm: NpuFeatureMap, strides: NpuShape3D):
280
281 element_size_in_bytes = fm.data_type.size_in_bytes()
282
283 if fm.layout == NpuLayout.NHCWB16:
284 strides_to_check = [strides.depth, strides.height]
Björn Davidsson199e8e62023-10-10 11:22:59 +0200285 required_multiple = 16
William Isakssona4f84112023-06-19 15:31:46 +0000286 else:
287 strides_to_check = [strides.height, strides.width]
288 required_multiple = element_size_in_bytes
289 for stride in strides_to_check:
Björn Davidsson199e8e62023-10-10 11:22:59 +0200290 check_stride(stride, required_multiple)
William Isakssona4f84112023-06-19 15:31:46 +0000291
292
293def check_addresses(addresses: List[int], layout: NpuLayout, element_size, arch: ArchitectureFeatures):
294 if layout == NpuLayout.NHCWB16:
295 required_alignment = arch.storage_rounding_quantums[TensorFormat.NHCWB16][-1]
296 else:
297 required_alignment = element_size
298 for addr in addresses:
299 check_alignment(addr, required_alignment)
300
301
Louis Verhaard1e170182020-11-26 11:42:04 +0100302# -------------------------------------------------------------------
303# DMA_WAIT/KERNEL_WAIT
304# -------------------------------------------------------------------
305
306
307class Watermark(NamedTuple):
308 npu: int
309 dma: int
310
311
312def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
313 return MemoryRangeSet(range.region, range.address, range.address + range.length)
314
315
316def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
317 """Returns the address that are read and written by the given DMA operation"""
318 res = MemoryAccessSet()
319 res.add(memory_range_set(dma_op.src), AccessDirection.Read)
320 res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
321 return res
322
323
324def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
325 """Returns the addresses that are read and written by the given operation"""
326 assert npu_op.ifm is not None and npu_op.ofm is not None
327 # Read addresses
328 read_ranges = get_address_ranges(npu_op.ifm)
329 if has_ifm2(npu_op):
330 assert npu_op.ifm2 is not None
331 read_ranges.extend(get_address_ranges(npu_op.ifm2))
332 read_ranges.extend(npu_op.weights)
333 read_ranges.extend(npu_op.biases)
334 if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
335 address = arch.available_shram_banks(True) * arch.shram_bank_size
336 read_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=address, length=2048))
337 # Written addresses
338 write_ranges = get_address_ranges(npu_op.ofm)
339 # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
340 uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
341 written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
342 write_ranges.append(NpuAddressRange(region=BASE_PTR_INDEX_MEM2MEM, address=0, length=written_shram_size))
343
344 res = MemoryAccessSet()
345 for read_range in read_ranges:
346 if read_range is not None:
347 res.add(memory_range_set(read_range), AccessDirection.Read)
348 for write_range in write_ranges:
349 if write_range is not None:
350 res.add(memory_range_set(write_range), AccessDirection.Write)
351 return res
352
353
354def get_wait_dependency(
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000355 arch: ArchitectureFeatures,
356 npu_op: NpuOperation,
357 memory_accesses,
358 outstanding_dma_ops: List[NpuOperation],
359 outstanding_npu_ops: List[NpuOperation],
Louis Verhaard1e170182020-11-26 11:42:04 +0100360):
361 """Used to calculate whether DMA wait or kernel wait operations are needed"""
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000362 kern_wait = -1
363 dma_wait = -1
364 op_accesses = memory_accesses[npu_op]
Louis Verhaard1e170182020-11-26 11:42:04 +0100365
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000366 if isinstance(npu_op, NpuDmaOperation):
367 outstanding_ops = outstanding_npu_ops
368 outstanding_dma_ops.append(npu_op)
369 if len(outstanding_dma_ops) > arch.max_outstanding_dma:
370 outstanding_dma_ops.pop(0)
371 else:
372 outstanding_ops = outstanding_dma_ops
373 outstanding_npu_ops.append(npu_op)
374 if len(outstanding_npu_ops) > arch.max_outstanding_kernels:
375 outstanding_npu_ops.pop(0)
Louis Verhaard1e170182020-11-26 11:42:04 +0100376
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000377 waits = -1
378 for idx in range(len(outstanding_ops) - 1, -1, -1):
379 waits += 1
380 other_op = outstanding_ops[idx]
381 other_accesses = memory_accesses[other_op]
382 if other_accesses.conflicts(op_accesses):
383 if isinstance(npu_op, NpuDmaOperation):
384 kern_wait = waits
385 else:
386 dma_wait = waits
387 # Current op needs to wait, and after it has waited,
388 # outstanding_ops[0..idx] are not outstanding any longer
389 for i in range(idx + 1):
390 outstanding_ops.pop(0)
391 break
Louis Verhaard1e170182020-11-26 11:42:04 +0100392
Alexander Hanssonca9cc422023-06-22 16:01:27 +0000393 cmd_waits = Watermark(kern_wait, dma_wait)
394 return cmd_waits
Louis Verhaard1e170182020-11-26 11:42:04 +0100395
396
William Isakssona4f84112023-06-19 15:31:46 +0000397def check_dma_op(dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
398
399 # For Ethos-U65 only internal addresses have to be aligned, and if the internal address is the destination
400 # then the length has to be aligned also.
401 if arch.is_ethos_u65_system:
402 if dma_op.src.region == BASE_PTR_INDEX_MEM2MEM:
403 check_alignment(dma_op.src.address, 16)
404 if dma_op.dest.region == BASE_PTR_INDEX_MEM2MEM:
405 check_alignment(dma_op.dest.address, 16)
Björn Davidsson199e8e62023-10-10 11:22:59 +0200406 check_length(dma_op.src.length, 16)
William Isakssona4f84112023-06-19 15:31:46 +0000407 else:
408 check_alignment(dma_op.src.address, 16)
409 check_alignment(dma_op.dest.address, 16)
Björn Davidsson199e8e62023-10-10 11:22:59 +0200410 check_length(dma_op.src.length, 16)
William Isakssona4f84112023-06-19 15:31:46 +0000411
412
Louis Verhaard1e170182020-11-26 11:42:04 +0100413# -------------------------------------------------------------------
414# BLOCKDEP
415# -------------------------------------------------------------------
416
417
418def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
419 # Note: NOT equivalent to the normal ifm block depth calculation since
420 # it takes into account 'depthless' block operations by returning full
421 # depth
422 if npu_op.op_type == NpuOperationType.Conv2D:
423 res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
424 return res
425 return npu_op.ofm.shape.depth
426
427
428def coords_intersect(start_a: PointXYZ, end_a: PointXYZ, start_b: PointXYZ, end_b: PointXYZ) -> bool:
429 """Checks if the two areas overlap"""
430 start_x = max(start_a.x, start_b.x)
431 end_x = min(end_a.x, end_b.x)
432 start_y = max(start_a.y, start_b.y)
433 end_y = min(end_a.y, end_b.y)
434 start_z = max(start_a.z, start_b.z)
435 end_z = min(end_a.z, end_b.z)
436 return ((end_x - start_x) > 0) and ((end_y - start_y) > 0) and ((end_z - start_z) > 0)
437
438
439def intersects(
440 ifm: NpuFeatureMap,
441 ifm_start_coord: PointXYZ,
442 ifm_end_coord: PointXYZ,
443 prev_ofm: NpuFeatureMap,
444 ofm_start_coord: PointXYZ,
445 ofm_end_coord: PointXYZ,
446) -> bool:
447 """Checks if the given IFM area overlaps with the given OFM area"""
448 if ifm.shape == prev_ofm.shape and ifm.tiles == prev_ofm.tiles:
449 # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
450 # if the xyz coordinates overlap, which is quick and easy
451 res = coords_intersect(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
452 else:
453 # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
454 # In this case, address comparison between the two areas is needed
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100455 ifm_ranges: List[Optional[NpuAddressRange]] = get_address_ranges_for_area(ifm, ifm_start_coord, ifm_end_coord)
Louis Verhaard1e170182020-11-26 11:42:04 +0100456 prev_ofm_ranges = get_address_ranges_for_area(prev_ofm, ofm_start_coord, ofm_end_coord)
457 res = range_lists_overlap(ifm_ranges, prev_ofm_ranges)
458 return res
459
460
461# Block job dependency:
462# Does the VOLUME of IFMs for block job B(0) overlap with VOLUME of OFMs block jobs A(8,9,10)
463#
464# A | B
465# ----------------------+------------------
466# .... 3,4,5,6,7,8,9,10 | 0,1,2,3,4,5,6,8 10 < JOB NUMBER
467# |<------->| dependency offset
468#
469
470
471def get_offset_block_coords(area: Rect, block: Block, offset: int) -> Optional[PointXYZ]:
472 """
473 Get the coordinates of a block offset from either the end (negative)
474 or the start (zero or positive) of the given 3D area
475 """
476 size = area.size()
477 # Dimensions of the region, in blocks
478 width_blocks = numeric_util.round_up_divide(size.width, block.width)
479 height_blocks = numeric_util.round_up_divide(size.height, block.height)
480 depth_blocks = numeric_util.round_up_divide(size.depth, block.depth)
481 total_blocks = width_blocks * height_blocks * depth_blocks
482 if offset < 0:
483 index = total_blocks + offset
484 else:
485 index = offset
486
487 if index >= total_blocks:
488 return None
489
490 # Coordinates of the indexed block
491 coord_z = block.depth * (index % depth_blocks)
492 coord_y = block.height * (index // (depth_blocks * width_blocks))
493 coord_x = block.width * ((index // depth_blocks) % width_blocks)
494
495 return PointXYZ(x=coord_x + area.x, y=coord_y + area.y, z=coord_z + area.z)
496
497
498def get_first_job_input_volume(
499 arch: ArchitectureFeatures,
500 ifm: Rect,
501 ofm: Rect,
502 ifm_block_depth,
503 ofm_block: Block,
504 kernel: Kernel,
505 padding: NpuPadding,
506 block_offset: int,
507):
508 # Get ifm block size (jobs are invisibly decomposed into subkernels)
509 ifm_block = arch.get_ifm_block_size(ifm_block_depth, ofm_block, kernel, arch.ofm_block_max)
510 ifm_depth_blocks = numeric_util.round_up_divide(ifm.size().depth, ifm_block_depth)
511
512 # Which OFM block are we calculating
513 ofm_coord = get_offset_block_coords(ofm, ofm_block, block_offset // ifm_depth_blocks)
514 if ofm_coord is None:
515 return None
516
517 # Coordinate of the source IFM block
518 ifm_coord_x = max(0, ofm_coord[0] * kernel.stride.x - padding.left)
519 ifm_coord_y = max(0, ofm_coord[1] * kernel.stride.y - padding.right)
520 ifm_coord_z = ifm.z + (block_offset % ifm_depth_blocks) * ifm_block.depth
521
522 # IFM block that will be sampled for the FIRST+block_offset job in the next operator's OFM
523 start_coord = PointXYZ(x=ifm_coord_x, y=ifm_coord_y, z=ifm_coord_z)
524 end_coord = PointXYZ(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200525 x=start_coord[0] + ifm_block.width,
526 y=start_coord[1] + ifm_block.height,
527 z=start_coord[2] + ifm_block.depth,
Louis Verhaard1e170182020-11-26 11:42:04 +0100528 )
529 return (start_coord, end_coord, 1) # start, end, total jobs
530
531
532def get_prev_job_output_volume(ofm: Rect, ofm_block: Block, block_offset: int):
533 assert block_offset >= 0
534
535 # Get OFM block's volume coordinates
536 start_coord = get_offset_block_coords(ofm, ofm_block, -1 - block_offset)
537 if start_coord is None:
538 return None
539 end_coord = PointXYZ(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200540 x=start_coord.x + ofm_block.width,
541 y=start_coord.y + ofm_block.height,
542 z=start_coord.z + ofm_block.depth,
Louis Verhaard1e170182020-11-26 11:42:04 +0100543 )
544 return (start_coord, end_coord, 1) # start, end, total jobs for this OFM block
545
546
Jonas Ohlssond8575072022-03-30 10:30:25 +0200547def calc_blockdep(
548 arch: ArchitectureFeatures,
549 prev_op: Optional[NpuBlockOperation],
550 npu_op: NpuBlockOperation,
551) -> int:
Louis Verhaard1e170182020-11-26 11:42:04 +0100552 """Calculates the value of the BLOCKDEP register"""
553 if prev_op is None:
554 return 0
555 assert npu_op.ifm is not None
556 assert prev_op.ofm is not None
Diqing Zhong455e20e2021-02-03 16:37:31 +0100557 # Check if the reserved shram will be used in current/prev op
558 prev_uses_lut = prev_op.activation is not None and prev_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
559 curr_uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
560 if prev_uses_lut and arch.shram_reserved_unused_banks == 0 and not curr_uses_lut:
561 return 0
562
Louis Verhaard1e170182020-11-26 11:42:04 +0100563 # Check if IFM or IFM2 overlaps with prev op's OFM
564 prev_ofm_ranges = get_address_ranges(prev_op.ofm)
565 ifm_ranges = get_address_ranges(npu_op.ifm)
566 ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
567 if has_ifm2(npu_op):
568 assert npu_op.ifm2 is not None
569 ifm2_ranges = get_address_ranges(npu_op.ifm2)
570 ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
571 else:
572 ifm2_overlaps = False
573 if ifm_overlaps and ifm2_overlaps:
574 # Both IFM and IFM2 overlap (should be rare)
575 return 0
576 if not ifm_overlaps and not ifm2_overlaps:
577 # No overlap between prev OFM and IFM/IFM2
578 return ArchitectureFeatures.MAX_BLOCKDEP
579 if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
580 # Prev OFM produces IFM2 which is broadcasted (this should be rare)
581 return 0
582 # Prev OFM overlaps with IFM or IFM2; calculate the blockdep
583 prev_block_config = prev_op.block_config
584 block_config = npu_op.block_config
585 overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
586 assert overlapping_fm is not None
587
588 cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
589 cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
590 cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
591 cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
592 padding = NpuPadding(0, 0, 0, 0) if npu_op.padding is None else npu_op.padding
593 blockdep = ArchitectureFeatures.MAX_BLOCKDEP
594 kernel = to_kernel(npu_op.kernel)
595
596 prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
597 prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
598 # Iterate over the next BLOCKDEP inputs, checking to see if a sliding window
599 # of IFM area overlaps with any previous OFM block generation.
600 elapsed_jobs = 0
601 for forward_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
602 # This is the IFM block we want to sample from
603 in_area = get_first_job_input_volume(
604 arch, cur_ifm_rect, cur_ofm_rect, cur_ifm_block_depth, cur_ofm_block, kernel, padding, forward_offset
605 )
606 if in_area is None:
607 break
608
609 # Try several previous-OFM blocks in the past (they still might comprise multiple IFM jobs)
610 outstanding_jobs = 0
611 for block_offset in range(ArchitectureFeatures.MAX_BLOCKDEP):
612 # This is the OFM block being generated by the previous op
613 out_area = get_prev_job_output_volume(prev_ofm_rect, prev_ofm_block, block_offset)
614 if out_area is None:
615 break
616
617 # Block dependency is the max number of allowed outstanding jobs
618 # in the pipeline. Selected by determining how many jobs occur
619 # in between two operators' overlapping OFM->IFM block volumes
620 if intersects(overlapping_fm, in_area[0], in_area[1], prev_op.ofm, out_area[0], out_area[1]):
621 break
622 # Early exit if no intersections and we've seen enough jobs in the pipeline
623 elif outstanding_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
624 break
625
626 # This OFM had this many jobs (accumulate over multiple OFM blocks)
627 outstanding_jobs += out_area[2]
628
629 blockdep = min(blockdep, elapsed_jobs + outstanding_jobs)
630 elapsed_jobs += in_area[2]
631 # Early exit if no intersections and we've seen enough jobs in the pipeline
632 if elapsed_jobs > ArchitectureFeatures.MAX_BLOCKDEP:
633 break
634
635 return blockdep