blob: c9b57f22a253cfdc69dfc2dcd3946f0e877321ba [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings, calculates dependencies between commands and inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010024from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010025from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010026from typing import List
27from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010028
29import numpy as np
30
31from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010032from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010033from .api import NpuActivation
34from .api import NpuActivationOp
35from .api import NpuAddressRange
36from .api import NpuBlockOperation
37from .api import NpuBlockTraversal
38from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010039from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010040from .api import NpuDataType
41from .api import NpuDmaOperation
42from .api import NpuElementWiseOp
43from .api import NpuElementWiseOperation
44from .api import NpuFeatureMap
45from .api import NpuKernel
46from .api import NpuLayout
47from .api import NpuOperation
48from .api import NpuOperationType
49from .api import NpuPadding
50from .api import NpuPoolingOp
51from .api import NpuPoolingOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010052from .api import NpuResamplingMode
53from .api import NpuRoundingMode
54from .api import NpuShape3D
55from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010056from .architecture_allocator import ArchitectureBlockConfig
57from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010058from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010060from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010061from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010062from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010063from .ethos_u55_regs.ethos_u55_regs import acc_format
64from .ethos_u55_regs.ethos_u55_regs import activation
65from .ethos_u55_regs.ethos_u55_regs import cmd0
66from .ethos_u55_regs.ethos_u55_regs import cmd1
67from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020068from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020069from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010070from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020073from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010074from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010075from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010076from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010077from .register_command_stream_util import calc_blockdep
78from .register_command_stream_util import get_dma_memory_accesses
79from .register_command_stream_util import get_op_memory_accesses
80from .register_command_stream_util import get_strides
81from .register_command_stream_util import get_wait_dependency
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010082from .register_command_stream_util import get_zero_point
Louis Verhaard1e170182020-11-26 11:42:04 +010083from .register_command_stream_util import has_ifm2
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010084from .register_command_stream_util import quantise
Tim Halld8339a72021-05-27 18:49:40 +010085from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010086from .register_command_stream_util import to_kernel
87from .register_command_stream_util import UNARY_ELEMWISE_OPS
88from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to each register so that repeated writes can be detected"""

    def __init__(self):
        # A single bank is configured, so switch_bank() always wraps back to bank 0
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records value for reg in the current bank and returns whether it differs from the stored value"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]  # a register that was never written reads back as None
        bank[reg] = value
        # Return True unconditionally here to force every command to be emitted
        return previous != value

    def switch_bank(self):
        """Moves on to the next register bank, wrapping around after the last one"""
        self.bank_idx = (1 + self.bank_idx) % self.n_banks
105
106
class CmdMode(IntEnum):
    """Bit fields found in the lower 16 bits (the command code) of the first command word"""

    # Payload mode occupies bits 14-15 of the command code
    NoPayload = 0 << 14
    Payload32 = 1 << 14
    Mask = 3 << 14
    # The command opcode occupies the low 10 bits
    CmdOpMask = (1 << 10) - 1
112
113
class CommandStreamEmitter:
    """Collects register commands as tuples of 32-bit words and drops redundant register writes

    Each entry of cmd_stream is a tuple holding one word (cmd0 style command) or two words (cmd1 style
    command with a 32-bit payload). offset tracks the size in bytes of everything emitted so far.
    """

    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        # Index 0 is used for all non-DMA commands, index 1 for DMA commands (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that shadows the given command"""
        machine_idx = 1 if "DMA" in cmd.name else 0
        return self.reg_machine[machine_idx]

    def size_in_bytes(self):
        """Returns the total size in bytes of all emitted commands"""
        word_count = sum(len(cmd) for cmd in self.cmd_stream)
        return word_count * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a single list of words"""
        flat: List[int] = []
        for cmd in self.cmd_stream:
            flat.extend(cmd)
        return flat

    def print_cmds(self):
        """Prints one line per emitted command: byte offset, payload, parameter, code and command name"""
        header = "".join(
            (
                f" {'Offset':6}:",
                f" {'Payload':8}",
                f"{'Param':4}",  # no leading space for alignment
                f" {'Code':4}",
                f" - {'Command':30}",
                f" {'Param':5}",
            )
        )
        print(header)

        byte_offset = 0
        for words in self.cmd_stream:
            first_word = words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)
            opcode = code & CmdMode.CmdOpMask

            if payload_mode == CmdMode.NoPayload:
                payload_text = f"{'':8}"
                command = cmd0(opcode)
                size = 4
            else:
                assert payload_mode == CmdMode.Payload32
                payload_text = f"{words[1]:08x}"
                command = cmd1(opcode)
                size = 8

            print(f"{byte_offset:#08x}: {payload_text} {param:04x} {code:04x} - {command:30} {param:5}")
            byte_offset += size

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word command with a 16-bit parameter, unless it repeats the previous write"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        is_new = self.get_reg_machine(cmd).set_register(cmd, (command, param))
        if is_new:
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word command (code/parameter word + 32-bit payload), unless it repeats the previous write"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        is_new = self.get_reg_machine(cmd).set_register(cmd, (command, offset))
        if is_new:
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two-word command where the bits above bit 31 of offset are carried in the parameter field"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; these bypass the register machine and are always written"""
        param = (channel * 16 + outstanding_count) & 0xFFFF
        self.cmd_stream.append((cmd.value | (param << 16),))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always written) and switches the register bank afterwards"""
        param = int(param) & 0xFFFF
        self.cmd_stream.append((cmd.value | (param << 16),))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
217
218
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100219# -------------------------------------------------------------------
220# REGISTER GENERATION
221# -------------------------------------------------------------------
222
223
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100224# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST register value"""

    BroadcastHdim = 0x01
    BroadcastWdim = 0x02
    BroadcastCdim = 0x04
    ReverseOperandOrder = 0x40
    UseIFM2Scalar = 0x80
231
232
# Maps the API pooling sub-operation to the pooling_mode register value
pooling_op_map = {
    api_op: reg_mode.value
    for api_op, reg_mode in (
        (NpuPoolingOp.MAX, pooling_mode.MAX),
        (NpuPoolingOp.AVERAGE, pooling_mode.AVERAGE),
        (NpuPoolingOp.REDUCE_SUM, pooling_mode.REDUCE_SUM),
    )
}

# Maps the API elementwise sub-operation to the elementwise_mode register value
elementwise_op_map = {
    api_op: reg_mode.value
    for api_op, reg_mode in (
        (NpuElementWiseOp.MUL, elementwise_mode.MUL),
        (NpuElementWiseOp.ADD, elementwise_mode.ADD),
        (NpuElementWiseOp.SUB, elementwise_mode.SUB),
        (NpuElementWiseOp.MIN, elementwise_mode.MIN),
        (NpuElementWiseOp.MAX, elementwise_mode.MAX),
        (NpuElementWiseOp.LRELU, elementwise_mode.LRELU),
        (NpuElementWiseOp.ABS, elementwise_mode.ABS),
        (NpuElementWiseOp.CLZ, elementwise_mode.CLZ),
        (NpuElementWiseOp.SHR, elementwise_mode.SHR),
        (NpuElementWiseOp.SHL, elementwise_mode.SHL),
    )
}

# Maps the API activation operation to the activation enum member (TABLE_LOOKUP is handled separately)
activation_op_map = dict(
    zip(
        (NpuActivationOp.NONE_OR_RELU, NpuActivationOp.TANH, NpuActivationOp.SIGMOID),
        (activation.NONE, activation.TANH, activation.SIGMOID),
    )
)

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    shram_element: fmt.value
    for shram_element, fmt in (
        (SHRAMElements.Acc16, acc_format.FP_S5_10),
        (SHRAMElements.Acc32, acc_format.INT_32BIT),
        (SHRAMElements.Acc40, acc_format.INT_40BIT),
    )
}

# Maps the API resampling mode to the resampling_mode enum member
resampling_mode_map = dict(
    zip(
        (NpuResamplingMode.NONE, NpuResamplingMode.NEAREST, NpuResamplingMode.TRANSPOSE),
        (resampling_mode.NONE, resampling_mode.NEAREST, resampling_mode.TRANSPOSE),
    )
)

# Maps data type size in bits to activation precision (8 -> 0, 16 -> 1, 32 -> 2)
precision_map = {bits: code for code, bits in enumerate((8, 16, 32))}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    api_mode: reg_mode.value
    for api_mode, reg_mode in (
        (NpuRoundingMode.TFL, rounding.TFL),
        (NpuRoundingMode.TRUNCATE, rounding.TRUNCATE),
        (NpuRoundingMode.NATURAL, rounding.NATURAL),
    )
}
280
281
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    Both the start and the end of every accessed range are checked against the limit of their region.

    :param memory_accesses: set of memory accesses to validate
    :param mem_limits: maps region to the highest allowed address offset in that region
    :raises VelaError: if a region is not present in mem_limits, or an offset is negative or above the limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named region_limit (not max) so that the builtin max() is not shadowed
            region_limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > region_limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {region_limit}."
                            " Perhaps try running with the HillClimb tensor allocator and/or increasing the maximum"
                            " iteration of that allocator"
                        )
Louis Verhaard024c3552021-03-17 14:26:34 +0100299
300
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers (top, left, bottom and right, in that order)"""
    pad_settings = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_settings:
        emit.cmd0_with_param(pad_cmd, amount)
307
308
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers

    A missing activation is treated as NONE_OR_RELU. The clamp range is taken from the activation's min/max
    (quantised with the OFM quantization) where given, otherwise from the OFM data type. It is always limited
    to both the int16 range and the range of the OFM data type.
    """
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation
    int16_range = np.iinfo(np.int16)

    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, int16_range.min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, int16_range.max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        # Table lookup: the activation value is 16 plus the index of the lookup table
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    for act_cmd, act_param in (
        (cmd0.NPU_SET_ACTIVATION, activation_value),
        (cmd0.NPU_SET_ACTIVATION_MIN, quantized_min),
        (cmd0.NPU_SET_ACTIVATION_MAX, quantized_max),
    ):
        emit.cmd0_with_param(act_cmd, act_param)
335
336
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers

    The first four entries of ptr_cmds are paired with the first four entries of addresses.
    """
    if layout == NpuLayout.NHCWB16:
        # With this layout every BasePointer address is expected to be aligned to 16 bytes
        assert not any(int(addr) % 16 for addr in addresses)
    for tile_idx in (0, 1, 2, 3):
        emit.cmd1_with_address(ptr_cmds[tile_idx], addresses[tile_idx])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100344
345
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers; each register holds the tile size minus one"""
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for idx, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[idx], size - 1)
351
352
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the strides of the given feature map"""
    strides = get_strides(fm)
    stride_settings = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_settings:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100361
362
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register

    Bit 0 holds the signedness, bits 2 and up the activation precision, bit 6 is set for NHCWB16 layout and
    op_to_scale is placed from bit 8.
    """
    dtype = fm.data_type
    prec = 0
    if dtype.is_signed():
        prec |= 1
    prec |= precision_map[dtype.size_in_bits()] << 2
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
375
376
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register

    Bit 0 holds the signedness, bits 1 and up the activation precision, bit 6 is set for NHCWB16 layout,
    bit 8 selects global scale and the rounding mode is placed from bit 14.
    """
    ofm = npu_op.ofm
    dtype = ofm.data_type
    prec = 0
    if dtype.is_signed():
        prec |= 1
    prec |= precision_map[dtype.size_in_bits()] << 1
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
391
392
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations

    When IFM2 is a scalar only the scalar flag is set. Otherwise every dimension in which the IFM and IFM2
    shapes differ is flagged for broadcast, and IFM2 must have size 1 in that dimension.
    """
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape, ifm2_shape = ifm.shape, ifm2.shape
        dimensions = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),  # 'H' dimension
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),  # 'W' dimension
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),  # 'C' dimension
        )
        for ifm_dim, ifm2_dim, broadcast_flag in dimensions:
            if ifm_dim != ifm2_dim:
                assert ifm2_dim == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
420
421
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, base addresses, tile sizes, depth, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    tiles = ifm.tiles

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100437
438
439def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
440 """Generates general IFM2 registers: region, base addresses, tile sizes, strides and zero point"""
# has_scalar guards the IFM2 registers that follow this check
# NOTE(review): indentation is not visible in this listing, so the exact extent of the guard (in particular
# whether the stride registers fall under it) should be confirmed against the repository copy
441 if not has_scalar:
442 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
443 generate_addresses(
444 emit,
445 [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
446 ifm2.tiles.addresses,
447 ifm2.layout,
448 )
449 generate_tiles(
450 emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
451 )
452 generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +0100453 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100454
455
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, base addresses, tile sizes, shape, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    tiles = ofm.tiles

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, tiles)
    # The shape registers hold each dimension minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100473
474
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers

    KERNEL_HEIGHT_M1/WIDTH_M1 are set to dilation * (size - 1). KERNEL_STRIDE is assembled as:
    bit 0: x stride low bit, bit 1: y stride low bit, bit 2: part-kernel-first traversal,
    bit 3: x dilation - 1, bit 4: y dilation - 1, bits 6 and up: x stride extension,
    bits 9 and up: y stride extension.
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Low bit of each stride
    stride = stride_x_m1 & 1
    stride |= (stride_y_m1 & 1) << 1
    # Remaining bits of each stride go into the extension fields
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
492
493
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers

    Nothing is emitted for an empty weights list. A core without its own address range, but present in the
    architecture, gets the address of the first range with length 0.
    """
    num_ranges = len(weights)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    per_core_cmds = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < num_ranges:
            address, length = weights[core].address, weights[core].length
        elif core < arch.ncores:
            address, length = weights[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
512
513
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers

    Nothing is emitted for an empty biases list. A core without its own address range, but present in the
    architecture, gets the address of the first range with length 0.
    """
    num_ranges = len(biases)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    per_core_cmds = (
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    )
    # Set scale/bias sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < num_ranges:
            address, length = biases[core].address, biases[core].length
        elif core < arch.ncores:
            address, length = biases[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
529
530
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers; each register holds the block dimension minus one"""
    block_settings = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, dimension in block_settings:
        emit.cmd0_with_param(blk_cmd, dimension - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100539
540
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers

    The values are taken from the layout and accumulator type of arch_block_config. IFM2_IB_START is only
    written for operations that have an IFM2.
    """
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    acc_format_value = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_value)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100552
553
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.

    Note: this function has no implementation; it ignores its arguments and always returns None.
    """
    return None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100561
562
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation

    The operation's block_config must be set. The block type is derived from the operation class, and
    try_block_config is asked for the matching architecture block config, which must fit.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    part_kernel_first = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # Two LUT banks are requested when the activation is a table lookup
    act = npu_op.activation
    uses_lut = act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # Scaling is in use only if every feature map has quantization with a scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps
    )

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    requested_block = shape3d_to_block(npu_op.block_config)

    arch_block_config = try_block_config(
        requested_block,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=part_kernel_first,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100612
613
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT

    A wait is only emitted for a watermark value that is not negative; channel 0 is always used.
    """
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in pending_waits:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
621
622
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations

    Covers IFM, padding (when present), OFM, kernel (all operations except elementwise), weights, biases,
    activation, block config and SHRAM registers.
    """
    ifm, ofm = npu_op.ifm, npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    is_elementwise = npu_op.op_type == NpuOperationType.ElementWise
    if not is_elementwise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block config and the SHRAM layout derived from it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100649
650
651# -------------------------------------------------------------------
652# SCALING
653# -------------------------------------------------------------------
654
655
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    The scale and shift are selected by the first matching case:
      1. the activation is SIGMOID or TANH
      2. the operation has fused_quantize set
      3. the operation has a rescale value (either an ExplicitScaling or a plain number)
      4. otherwise the ratio of the IFM and OFM quantization scales is used when both are
         available, else scale=1 and shift=0

    :param emit: emitter that receives the NPU_SET_OFM_SCALE command
    :param pool_op: pooling operation to generate the OFM scale for
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/2048 (tanh/sigmoid) or 1/4096 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale (incrementing shift each time) until it exceeds half the int16
                # maximum or shift exceeds 30
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # Number of bits of the rounded-up rescale value, plus one
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # isinstance (rather than an exact type comparison) so that a subclass of ExplicitScaling
        # is not mistaken for a plain numeric rescale value
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization scale available: no scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
728
729
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    OPA_SCALE and OPB_SCALE are only emitted for ADD and SUB; OFM_SCALE is always emitted.

    :param emit: emitter that receives the scale commands
    :param npu_op: elementwise operation to generate scaling for
    :return: the operator to scale; 0 unless advanced add/sub scaling was used, in which case it is
        the value returned by scaling.advanced_elementwise_add_sub_scale (swapped between OPa and OPb
        when the operation has reversed operands)
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A missing quantization is represented as a scale of None
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Sigmoid/tanh activation overrides the output scale with a fixed value
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                # Input scales differ
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            # Operand scales are only emitted for Add/Sub
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # The ofm scale is derived from the OFM quantization scale only
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other elementwise operators: no ofm scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
818
819
820# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100821# PRINT
822# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200823
824
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """
    Prints a description of the given feature map: shape, region, layout, data type, size in
    bytes, quantization, strides, tiles and name. Prints nothing if fm is None.

    :param fm: feature map to print, or None
    :param name: label that the first printed line starts with (e.g. "IFM")
    """
    if fm is None:
        return

    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100841
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100842
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a description of the given NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations, are printed on a
    single line. Block operations additionally print their feature maps, kernel, padding, weights,
    biases, activation, block traversal (Conv2D only) and block config.

    :param npu_op: operation to print
    :param index: number printed at the start of the header line
    :param cmd: optional object appended to the header line when truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        # Neither DMA nor block operation: header line only
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        # DMA operation: header line with source and destination
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    # From here on npu_op is handled as a block operation
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, stride and dilation are all 1 is labelled as FullyConnected
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar second operand: print the value and its quantized form instead of a feature map
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # An activation of NONE_OR_RELU without min/max is not printed
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # rescale is only printed for pooling and elementwise operations
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100890
Tim Hall79d07d22020-04-27 18:20:16 +0100891
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """
    Prints every operation in npu_op_list, numbered by its position in the list.

    :param npu_op_list: operations to print
    :param npu_op_to_cmd: optional mapping from operation to an object that is printed along
        with that operation; operations missing from the mapping are printed without it
    """
    if npu_op_to_cmd is None:
        op_cmds = {}
    else:
        op_cmds = npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, op_cmds.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100896
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100897
898# -------------------------------------------------------------------
899# OPERATIONS
900# -------------------------------------------------------------------
901
902
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """
    Emits the NPU_OP_* command that matches the type of the given operation.

    DMA operations carry channel * 16 + mode as parameter; pooling and elementwise operations
    carry their mapped sub operation type as parameter.
    """
    if isinstance(npu_op, NpuDmaOperation):
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
917
918
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates the registers for a Conv2D operation, using the operation's own block traversal"""
    generate_common(emit, npu_op, block_traversal=npu_op.block_traversal, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100922
923
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates the registers for a depthwise convolution operation (always depth first traversal)"""
    generate_common(emit, npu_op, block_traversal=NpuBlockTraversal.DEPTH_FIRST, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100929
930
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    :param emit: emitter that receives the register commands
    :param npu_op: pooling operation to generate registers for
    :param arch: architecture features of the target
    :raises VelaError: for a REDUCE_SUM whose IFM layout is not NHWC, when either the IFM data type
        is INT32 or the accelerator config is Ethos_U65_512
    """
    # check that reduce_sum input is NHWC
    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # Average pooling and reduce sum without any padding use the global OFM scale
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # isinstance is False for None, and also accepts subclasses of ExplicitScaling
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100956
957
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Generates the registers for an elementwise operation.

    The scale registers are generated first, then the common registers, and finally (for operators
    that are not in UNARY_ELEMWISE_OPS) the IFM2 registers, including the quantized scalar value
    when the second operand is a scalar.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Unary operators have no second operand, so nothing more to generate
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return

    # Binary operator: IFM2 registers
    ifm2 = npu_op.ifm2
    assert ifm2 is not None
    scalar = npu_op.ifm2_scalar
    has_scalar = scalar is not None
    generate_ifm2(emit, ifm2, has_scalar)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return

    # The quantized scalar must be representable in the IFM2 data type
    quantized_scalar = quantise(scalar, ifm2.quantization)
    ifm2_type = ifm2.data_type
    assert ifm2_type.min_value() <= quantized_scalar <= ifm2_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100983
984
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """
    Generates the DMA0 registers for a DMA operation: source region and address, destination
    region and address, and the length (taken from the source).
    """
    source = dma_op.src
    destination = dma_op.dest

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100993
994
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches on the type of the operation; nothing is returned.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        # DMA register generation does not need the architecture
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001012
1013
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to generate commands for; each must be a DMA or block operation
    :param arch: architecture features of the target
    :param verbose: if True, the operations are printed before generation, and the generated
        commands, their number and the stream length are printed afterwards
    :param mem_limits: limits that the memory accesses of every operation are checked against
    :param add_to_debug_db: optional callable, invoked as add_to_debug_db(npu_op, emit.offset) after
        the NPU_OP command of each operation has been generated
    :param npu_op_to_cmd: optional mapping passed on to print_operations; only used when verbose
    :raises VelaError: if the memory limit check, wait dependency calculation or register generation
        of an operation raises VelaError (re-raised with the operation index and type appended),
        or if the resulting command stream is 16 MiB (1 << 24 bytes) or larger
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Only emitted for Ethos-U65 systems: parallel mode is set to the number of cores minus one
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    # Most recent block operation (None until the first one); used to calculate BLOCKDEP
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # The waits are emitted after the registers of the operation, but before its NPU_OP command
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res
1079
1080
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    The command stream is generated non-verbosely for the default architecture of the accelerator,
    with the memory limit of regions 0-7 set to the architecture's maximum address offset and the
    limit of the BASE_PTR_INDEX_MEM2MEM region set to the SHRAM size.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)