blob: 99ac32d5a915c5e85c74cd9e0150c2de5c19cf1a [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010024from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010025from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010026from typing import List
27from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010028
29import numpy as np
30
31from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010032from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010033from .api import NpuActivation
34from .api import NpuActivationOp
35from .api import NpuAddressRange
36from .api import NpuBlockOperation
37from .api import NpuBlockTraversal
38from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010039from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010040from .api import NpuDataType
41from .api import NpuDmaOperation
42from .api import NpuElementWiseOp
43from .api import NpuElementWiseOperation
44from .api import NpuFeatureMap
45from .api import NpuKernel
46from .api import NpuLayout
47from .api import NpuOperation
48from .api import NpuOperationType
49from .api import NpuPadding
50from .api import NpuPoolingOp
51from .api import NpuPoolingOperation
52from .api import NpuQuantization
53from .api import NpuResamplingMode
54from .api import NpuRoundingMode
55from .api import NpuShape3D
56from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010057from .architecture_allocator import ArchitectureBlockConfig
58from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010059from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010061from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010062from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010063from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010064from .ethos_u55_regs.ethos_u55_regs import acc_format
65from .ethos_u55_regs.ethos_u55_regs import activation
66from .ethos_u55_regs.ethos_u55_regs import cmd0
67from .ethos_u55_regs.ethos_u55_regs import cmd1
68from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020069from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020070from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010071from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import quantise_float32
73from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010074from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020075from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010076from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010077from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010078from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010079from .register_command_stream_util import calc_blockdep
80from .register_command_stream_util import get_dma_memory_accesses
81from .register_command_stream_util import get_op_memory_accesses
82from .register_command_stream_util import get_strides
83from .register_command_stream_util import get_wait_dependency
84from .register_command_stream_util import has_ifm2
Tim Halld8339a72021-05-27 18:49:40 +010085from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010086from .register_command_stream_util import to_kernel
87from .register_command_stream_util import UNARY_ELEMWISE_OPS
88from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to each register so that redundant writes can be detected"""

    def __init__(self):
        self.n_banks = 1
        self.bank_idx = 0
        # One register file per bank; a register that has never been written reads back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]

    def set_register(self, reg, value):
        """Stores value for reg in the active bank; returns True if it differs from the previously stored value"""
        active_bank = self.registers[self.bank_idx]
        previous = active_bank[reg]
        active_bank[reg] = value
        return previous != value

    def switch_bank(self):
        """Advances to the next register bank, wrapping around after the last one"""
        next_idx = self.bank_idx + 1
        self.bank_idx = next_idx % self.n_banks
105
106
class CmdMode(IntEnum):
    """Payload mode encodings and bit masks for the lower 16 bits (the code) of a command word"""

    NoPayload = 0  # the command consists of a single word
    Payload32 = 1 << 14  # the command word is followed by one payload word
    Mask = 0b11 << 14  # selects the payload mode bits of the code
    CmdOpMask = (1 << 10) - 1  # selects the command opcode bits of the code
112
113
class CommandStreamEmitter:
    """Collects the words of a register command stream, leaving out register writes that would not change state"""

    WORD_SIZE = 4

    def __init__(self):
        # Each entry is a tuple with the words of one command: (command,) or (command, payload)
        self.cmd_stream = []
        # Index 0 tracks registers of non-DMA commands, index 1 those of DMA commands
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Running size in bytes of the commands emitted so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that tracks cmd, selected on whether the command name contains "DMA" """
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the total size in bytes of all commands in the stream"""
        return sum(len(cmd) * CommandStreamEmitter.WORD_SIZE for cmd in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a list of words"""
        words: List[int] = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a header line followed by one line per command in the stream"""
        header = "".join(
            (
                f" {'Offset':6}:",
                f" {'Payload':8}",
                f"{'Param':4}",  # no leading space for alignment
                f" {'Code':4}",
                f" - {'Command':30}",
                f" {'Param':5}",
            )
        )
        print(header)

        byte_offset = 0
        for cmd_words in self.cmd_stream:
            first_word = cmd_words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            has_payload = payload_mode != CmdMode.NoPayload

            line = f"{byte_offset:#08x}:"

            if has_payload:
                assert payload_mode == CmdMode.Payload32
                line += f" {cmd_words[1]:08x}"
            else:
                line += f" {'':8}"

            line += f" {param:04x}"
            line += f" {code:04x}"

            # Commands without payload are looked up in cmd0 and take 4 bytes, the others in cmd1 and take 8
            if has_payload:
                line += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                byte_offset += 8
            else:
                line += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                byte_offset += 4

            line += f" {param:5}"
            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single word command with a 16-bit parameter, unless the register already holds that value"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a command with a 32-bit payload word, unless the register already holds that value"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, offset))
            self.offset += 2 * CommandStreamEmitter.WORD_SIZE

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a payload command where the bits of offset above bit 31 are passed as the command parameter"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; always written, there is no redundancy check"""
        param = outstanding_count + (16 * channel)
        self.cmd_stream.append((cmd.value | ((param & 0xFFFF) << 16),))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always written) and switches the bank of its register machine"""
        command = cmd.value | ((int(param) & 0xFFFF) << 16)

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
217
218
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100219# -------------------------------------------------------------------
220# REGISTER GENERATION
221# -------------------------------------------------------------------
222
223
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100224# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are combined into the parameter of the IFM2_BROADCAST register"""

    BroadcastHdim = 0x01  # bit 0
    BroadcastWdim = 0x02  # bit 1
    BroadcastCdim = 0x04  # bit 2
    ReverseOperandOrder = 0x40  # bit 6
    UseIFM2Scalar = 0x80  # bit 7
231
232
# Maps an API pooling op to the value of the corresponding pooling_mode member
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise op to the value of the corresponding elementwise_mode member
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation op to the corresponding activation member (the member itself, not its .value).
# There is no entry for NpuActivationOp.TABLE_LOOKUP.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode member (the member itself, not its .value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
280
281
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """
    Checks that an operation's memory accesses respect the boundaries imposed by mem_limits.

    memory_accesses: the accesses to check; both the start and the end of every range are validated
    mem_limits: maps a region to the highest address offset that is accepted in that region

    Raises VelaError if a region is missing from mem_limits, or if an offset is negative or larger than the
    limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named max_offset so that the builtin max() is not shadowed
            max_offset = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max_offset:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {max_offset}."
                            " Perhaps try running with the HillClimb tensor allocator and/or increasing the"
                            " maximum iteration of that allocator"
                        )
Louis Verhaard024c3552021-03-17 14:26:34 +0100299
300
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value; a missing scale defaults to 1 and a missing quantization to zero point 0"""
    if quant is None:
        return quantise_float32(value, 1, 0)
    scale = quant.scale_f32 if quant.scale_f32 is not None else 1
    return quantise_float32(value, scale, quant.zero_point)
306
307
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_settings = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, pad_size in pad_settings:
        emit.cmd0_with_param(pad_cmd, pad_size)
314
315
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No activation is treated the same as NONE_OR_RELU without bounds
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation
    int16_limits = np.iinfo(np.int16)

    # A bound that is not given defaults to the limit of the OFM data type
    lower = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    upper = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)
    # Restrict the bounds to what both int16 and the OFM data type can hold
    lower = max(lower, int16_limits.min, ofm.data_type.min_value())
    upper = min(upper, int16_limits.max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            lower = max(-128, lower)
            upper = min(127, upper)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, lower)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, upper)
342
343
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers; ptr_cmds and addresses must each hold at least 4 entries"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert not any(int(addr) % 16 for addr in addresses)
    for tile in range(4):
        base_cmd, base_addr = ptr_cmds[tile], addresses[tile]
        emit.cmd1_with_address(base_cmd, base_addr)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100351
352
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # The registers hold the size minus one; tile_cmds is ordered HEIGHT0, HEIGHT1, WIDTH0
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for index, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[index], size - 1)
358
359
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    fm_strides = get_strides(fm)
    stride_settings = (
        (stride_c_cmd, fm_strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, fm_strides.height),  # stride between vertical values (H)
        (stride_x_cmd, fm_strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_settings:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100368
369
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    data_type = fm.data_type
    signed_bit = 1 if data_type.is_signed() else 0  # bit 0
    precision_bits = precision_map[data_type.size_in_bits()] << 2  # from bit 2
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0  # bit 6
    scale_bits = op_to_scale << 8  # from bit 8

    # signed_bit and precision_bits are added like the original; the remaining fields are OR-ed in
    emit.cmd0_with_param(precision_cmd, (signed_bit + precision_bits) | layout_bit | scale_bits)
382
383
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    data_type = ofm.data_type
    signed_bit = 1 if data_type.is_signed() else 0  # bit 0
    precision_bits = precision_map[data_type.size_in_bits()] << 1  # from bit 1
    prec = signed_bit + precision_bits

    # Set global scale bit, as opposed to using per channel scale
    global_scale_bit = (1 << 8) if use_global_scale else 0
    layout_bit = (1 << 6) if ofm.layout == NpuLayout.NHCWB16 else 0
    rounding_bits = rounding_mode_map[npu_op.rounding_mode] << 14

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec | global_scale_bit | layout_bit | rounding_bits)
398
399
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    broadcast_bits = 0
    if npu_op.reversed_operands:
        broadcast_bits |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is None:
        ifm_shape = npu_op.ifm.shape
        ifm2_shape = npu_op.ifm2.shape
        # A dimension in which the two shapes differ is broadcast; IFM2 must then have size 1 in it
        dimensions = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),  # 'H' dimension
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),  # 'W' dimension
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),  # 'C' dimension
        )
        for ifm_size, ifm2_size, broadcast_flag in dimensions:
            if ifm_size != ifm2_size:
                assert ifm2_size == 1
                broadcast_bits |= broadcast_flag
    else:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        broadcast_bits |= IFM2Broadcast.UseIFM2Scalar

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast_bits)
427
428
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(
        emit,
        ifm,
        stride_c_cmd=cmd1.NPU_SET_IFM_STRIDE_C,
        stride_y_cmd=cmd1.NPU_SET_IFM_STRIDE_Y,
        stride_x_cmd=cmd1.NPU_SET_IFM_STRIDE_X,
    )
    zero_point = int(ifm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, zero_point)
444
445
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    # Region, base addresses and tile sizes are only generated when IFM2 is not a scalar
    if not has_scalar:
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        generate_tiles(emit, tile_cmds, ifm2.tiles)

    generate_strides(
        emit,
        ifm2,
        stride_c_cmd=cmd1.NPU_SET_IFM2_STRIDE_C,
        stride_y_cmd=cmd1.NPU_SET_IFM2_STRIDE_Y,
        stride_x_cmd=cmd1.NPU_SET_IFM2_STRIDE_X,
    )
    zero_point = int(ifm2.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, zero_point)
461
462
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The shape registers hold the size minus one
    shape_settings = (
        (cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height),
        (cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width),
        (cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth),
    )
    for shape_cmd, size in shape_settings:
        emit.cmd0_with_param(shape_cmd, size - 1)

    generate_strides(
        emit,
        ofm,
        stride_c_cmd=cmd1.NPU_SET_OFM_STRIDE_C,
        stride_y_cmd=cmd1.NPU_SET_OFM_STRIDE_Y,
        stride_x_cmd=cmd1.NPU_SET_OFM_STRIDE_X,
    )
    zero_point = int(ofm.quantization.zero_point)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, zero_point)
480
481
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # The size registers hold the dilated kernel size minus one
    dilated_height_m1 = kernel.dilation_y * (kernel.height - 1)
    dilated_width_m1 = kernel.dilation_x * (kernel.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, dilated_height_m1)
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, dilated_width_m1)

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # set kernel x stride low bit
    stride = stride_x_m1 & 1
    # set kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9
    # set dilation minus one for x and y
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
499
500
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    num_ranges = len(weights)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # Set weights sources for active and present cores
    core_cmds = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < num_ranges:
            address, length = weights[core].address, weights[core].length
        elif core < arch.ncores:
            # Core is present but has no address range of its own: first range's address, length 0
            address, length = weights[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
519
520
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    num_ranges = len(biases)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # Set scale sources for active and present cores
    core_cmds = (
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    )
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < num_ranges:
            address, length = biases[core].address, biases[core].length
        elif core < arch.ncores:
            # Core is present but has no address range of its own: first range's address, length 0
            address, length = biases[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
536
537
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # The registers hold the block size minus one
    block_settings = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for block_cmd, size in block_settings:
        emit.cmd0_with_param(block_cmd, size - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100546
547
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    # IFM2_IB_START is only generated for operations that have a second input feature map
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    accumulator_format = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, accumulator_format)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100559
560
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): as visible here the function has no statements after the docstring, so every call
    # returns None regardless of the arguments. Confirm whether an implementation is missing or whether
    # the function is unused and can be removed.
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100568
569
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the class of the operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert False, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]

    # A table lookup activation uses 2 LUT banks
    act = npu_op.activation
    lut_banks = 2 if (act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP) else 0

    # The operation counts as scaled only if every feature map has quantization with a scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    scaled = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None

    arch_block_config = try_block_config(
        shape3d_to_block(npu_op.block_config),
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=scaled,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100619
620
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding_count in pending_waits:
        # A negative count means that no wait is generated for that command
        if outstanding_count >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding_count)
628
629
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are generated for everything except elementwise operations
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block configuration and the shared buffer layout derived from it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100656
657
658# -------------------------------------------------------------------
659# SCALING
660# -------------------------------------------------------------------
661
662
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations

    The scale/shift pair is chosen by the first matching case:
    1. the operation has a SIGMOID or TANH activation
    2. pool_op.fused_quantize is set
    3. pool_op.rescale is set (either an ExplicitScaling or a plain rescale value)
    4. otherwise the ratio of IFM and OFM quantization scales, or scale=1/shift=0 if either is None

    :param emit: command stream emitter that receives the NPU_SET_OFM_SCALE command
    :param pool_op: pooling operation to generate the OFM scaling for
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        # 0x3000 == 3 * 4096
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            # The IFM scale is treated as a power of two if its log2 is within 0.001 of an integer
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an IFM scale of 2**-12, shift == 1 to 2**-11
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid) or 1/2048 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                # Keep doubling rescale (incrementing shift) while it is at most half of the int16
                # maximum and shift is at most 30
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # len(bin(x)) - 2 is the number of binary digits of a positive integer x (bin() adds "0b")
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if type(pool_op.rescale) == ExplicitScaling:
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            # Only a single (non per-channel) multiplier/shift pair can be used here
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    # Negative bit count, derived from the inverse of the rescale
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # Quantization scale missing on IFM or OFM
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
735
736
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale

    OPA_SCALE and OPB_SCALE are only emitted for ADD and SUB; OFM_SCALE is emitted for every sub operation type.

    :param emit: command stream emitter that receives the scale commands
    :param npu_op: elementwise operation to generate the scaling for
    :return: 0, unless advanced ADD/SUB scaling was used, in which case it is the operand reported by
        scaling.advanced_elementwise_add_sub_scale (swapped if npu_op.reversed_operands is set)
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A scale is None if the corresponding feature map has no quantization
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # SIGMOID/TANH activations override the output scale with a fixed value
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                # I.e. simplified scaling is kept only when the low 12 bits of the ofm scale are all zero
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # Scaling is derived from the OFM quantization scale only
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other sub operation types: no ofm scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
825
826
827# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100828# PRINT
829# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200830
831
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """
    Prints shape, region, layout, data type, size, quantization, strides, tiles and name of a feature map.
    Nothing is printed if fm is None.
    """
    if fm is None:
        return
    if fm.quantization is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    size = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, {fm.data_type},"
        f" size={size}, {quant_str}"
    )
    strides = get_strides(fm)
    tiles = fm.tiles
    base_addresses = list(map(hex, tiles.addresses))
    print(
        f" Stride y/x/c: {strides.height}/{strides.width}/{strides.depth},"
        f" tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, base={base_addresses}"
    )
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100848
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100849
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a description of the given operation.

    DMA operations and operations that are not block operations are printed as a single header line.
    Block operations additionally get their feature maps, kernel, padding, weights, scales, activation,
    block traversal (Conv2D only) and block config printed.

    :param npu_op: operation to print
    :param index: number printed at the start of the header line
    :param cmd: optional object that is appended to the header line; left out if falsy
    """
    pass_info = "" if not cmd else f" {cmd}"
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, NpuBlockOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return

    # From here on the operation is handled as a block operation
    kernel = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    has_sub_op = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))
    if has_sub_op:
        op_desc = f"{npu_op.sub_op_type.name} {npu_op.op_type.name}"
    else:
        is_fully_connected = False
        if isinstance(npu_op, NpuConv2DOperation):
            # A Conv2D whose kernel size, strides and dilations multiply to 1 is labelled FullyConnected
            stride, dilation = kernel.stride, kernel.dilation
            is_fully_connected = kernel.elements_wh() * stride.x * stride.y * dilation.x * dilation.y == 1
        prefix = "FullyConnected " if is_fully_connected else ""
        op_desc = f"{prefix}{npu_op.op_type.name}"
    print(f"{index} {op_desc} name={npu_op.name}:{pass_info}")

    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    if kernel is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {kernel}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    act = npu_op.activation
    # An activation of type NONE_OR_RELU without min and max is not printed
    if act is not None and (act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None):
        lut = ""
        if act.op_type == NpuActivationOp.TABLE_LOOKUP:
            lut = f", lut index={act.lookup_table_index}"
        print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    blk_h, blk_w, blk_c = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_sub_op else ""
    print(f" Block config: h={blk_h},w={blk_w},c={blk_c}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100897
Tim Hall79d07d22020-04-27 18:20:16 +0100898
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """
    Prints all operations in the list, numbered by their position in the list.

    :param npu_op_list: operations to print
    :param npu_op_to_cmd: optional mapping from operation to the object to print alongside it
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for op_number, operation in enumerate(npu_op_list):
        mapped_cmd = npu_op_to_cmd.get(operation)
        print_operation(operation, op_number, mapped_cmd)
Tim Hall79d07d22020-04-27 18:20:16 +0100903
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100904
905# -------------------------------------------------------------------
906# OPERATIONS
907# -------------------------------------------------------------------
908
909
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """
    Generates NPU_OP_* command

    :param emit: command stream emitter that receives the command
    :param npu_op: operation to generate the command for; must be a DMA, Conv2D, depthwise convolution,
        pooling or elementwise operation
    :raises AssertionError: if the operation is of any other type
    """
    if isinstance(npu_op, NpuDmaOperation):
        # The DMA start parameter combines channel and mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        # Raised explicitly instead of "assert 0": assert statements are removed when Python runs with -O,
        # which would silently leave the operation without its NPU_OP command
        raise AssertionError("Unsupported operation")
924
925
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations, using the block traversal of the operation itself"""
    generate_common(emit, npu_op, block_traversal=npu_op.block_traversal, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100929
930
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations; block traversal is always depth first"""
    generate_common(emit, npu_op, block_traversal=NpuBlockTraversal.DEPTH_FIRST, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100936
937
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations

    :raises VelaError: for a REDUCE_SUM whose IFM layout is not NHWC, if the IFM data type is INT32 or the
        accelerator config is Ethos_U65_512
    """
    ifm = npu_op.ifm
    # check that reduce_sum input is NHWC
    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and ifm.layout != NpuLayout.NHWC:
        if ifm.data_type == NpuDataType.INT32:
            restriction = "IFM data type of INT32"
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            restriction = "accelerator config of Ethos_U65_512"
        else:
            restriction = None
        if restriction is not None:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with {restriction} requires IFM layout to be NHWC"
                f" ({ifm.name} == {ifm.layout})"
            )

    # AVERAGE and REDUCE_SUM use global scale when the padding sums to zero
    use_global_scale = False
    if npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM):
        use_global_scale = sum(npu_op.padding) == 0
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    explicit_scaling = npu_op.rescale
    if explicit_scaling is not None and type(explicit_scaling) == ExplicitScaling:
        use_global_scale = not explicit_scaling.per_channel

    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if not use_global_scale:
        return
    generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100963
964
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for elementwise operations.
    For binary operations this includes the IFM2 registers and, if set, the quantized IFM2 scalar.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit,
        npu_op,
        NpuBlockTraversal.DEPTH_FIRST,
        arch,
        use_global_scale=use_global_scale,
        op_to_scale=op_to_scale,
    )
    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; no IFM2 registers are generated
        return

    # Binary operation; generate IFM2 registers
    ifm2 = npu_op.ifm2
    assert ifm2 is not None
    scalar = npu_op.ifm2_scalar
    generate_ifm2(emit, ifm2, scalar is not None)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if scalar is None:
        return
    # The scalar is quantized with the IFM2 quantization and must fit in the IFM2 data type
    quantized_scalar = quantise(scalar, ifm2.quantization)
    assert ifm2.data_type.min_value() <= quantized_scalar <= ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100990
991
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates the source, destination and length register commands for a DMA operation"""
    source = dma_op.src
    destination = dma_op.dest
    # Source region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)
    # Destination region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)
    # The length is taken from the source
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001000
1001
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.

    :param emit: command stream emitter that receives the register commands
    :param npu_op: operation to generate register commands for
    :param arch: architecture features, forwarded to the block operation generators (not used for DMA)
    :raises AssertionError: if the operation type is not supported
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        # Raised explicitly instead of "assert 0": assert statements are removed when Python runs with -O,
        # which would silently skip register generation for the operation
        raise AssertionError("Unsupported operation")
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001019
1020
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to generate commands for; each must be a DMA or a block operation
    :param arch: architecture features of the target
    :param verbose: if True, the operations are printed before generation and the generated commands,
        their number and the stream size are printed afterwards
    :param mem_limits: limits that the memory accesses of every operation are checked against
        (passed to check_mem_limits)
    :param add_to_debug_db: optional callable, called as add_to_debug_db(npu_op, emit.offset) after the
        NPU_OP command of each operation has been generated
    :param npu_op_to_cmd: optional mapping from operation to command; only used for verbose printing
    :raises VelaError: if checking or generating an operation fails (the message is extended with the
        operation index and type), or if the command stream is 1 << 24 bytes or larger
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Parallel mode parameter is the number of cores minus one
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    # Most recent block operation (DMA operations do not update it); used for the BLOCKDEP calculation
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            # Clamp to the maximum supported by the architecture
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Waits are emitted after the registers and directly before the NPU_OP command
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res
1086
1087
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # Regions 0-7 are limited by the maximum address offset of the default architecture;
    # the mem2mem entry is then set to the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)