blob: f5f530d3535593295979db7014ffff8fa31ad805 [file] [log] [blame]
Rickard Bolinbc6ee582022-11-04 08:24:29 +00001# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Tim Hall79d07d22020-04-27 18:20:16 +010017# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000018# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010019# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000020# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010021import math
Tim Hall79d07d22020-04-27 18:20:16 +010022from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010023from enum import Enum
24from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010025from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010026from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from typing import List
28from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010029
30import numpy as np
31
32from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010033from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010034from .api import NpuActivation
35from .api import NpuActivationOp
36from .api import NpuAddressRange
37from .api import NpuBlockOperation
38from .api import NpuBlockTraversal
39from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010040from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010041from .api import NpuDataType
42from .api import NpuDmaOperation
43from .api import NpuElementWiseOp
44from .api import NpuElementWiseOperation
45from .api import NpuFeatureMap
46from .api import NpuKernel
47from .api import NpuLayout
48from .api import NpuOperation
49from .api import NpuOperationType
50from .api import NpuPadding
51from .api import NpuPoolingOp
52from .api import NpuPoolingOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010053from .api import NpuResamplingMode
54from .api import NpuRoundingMode
55from .api import NpuShape3D
56from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010057from .architecture_allocator import ArchitectureBlockConfig
58from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010059from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010061from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010062from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010063from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010064from .ethos_u55_regs.ethos_u55_regs import acc_format
65from .ethos_u55_regs.ethos_u55_regs import activation
66from .ethos_u55_regs.ethos_u55_regs import cmd0
67from .ethos_u55_regs.ethos_u55_regs import cmd1
68from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020069from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020070from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010071from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010073from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020074from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010075from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010076from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010077from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010078from .register_command_stream_util import calc_blockdep
79from .register_command_stream_util import get_dma_memory_accesses
80from .register_command_stream_util import get_op_memory_accesses
81from .register_command_stream_util import get_strides
82from .register_command_stream_util import get_wait_dependency
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010083from .register_command_stream_util import get_zero_point
Louis Verhaard1e170182020-11-26 11:42:04 +010084from .register_command_stream_util import has_ifm2
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010085from .register_command_stream_util import quantise
Tim Halld8339a72021-05-27 18:49:40 +010086from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010087from .register_command_stream_util import to_kernel
88from .register_command_stream_util import UNARY_ELEMWISE_OPS
89from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010090
91
class RegisterMachine:
    """Remembers the last value written to each register, per register bank"""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank; returns whether it differs from the stored value"""
        bank = self.registers[self.bank_idx]
        # A register that has never been written reads back as None
        previous = bank[reg]
        bank[reg] = value
        return previous != value

    def switch_bank(self):
        """Advances to the next register bank, wrapping around after the last one"""
        next_idx = self.bank_idx + 1
        self.bank_idx = next_idx % self.n_banks
106
107
class CmdMode(IntEnum):
    """Bit fields of the lower 16 bits (the command code) of the first command word

    Mask isolates the payload mode bits and CmdOpMask isolates the opcode bits.
    """

    NoPayload = 0 << 14  # the command consists of a single word
    Payload32 = 1 << 14  # the command word is followed by one payload word
    Mask = 3 << 14
    CmdOpMask = (1 << 10) - 1
113
114
class CommandStreamEmitter:
    """Collects the generated commands as tuples of 32-bit words

    Register-setting commands go through a RegisterMachine, so that a command which would write the
    value that was most recently written to the same register is not emitted again.
    """

    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        # Index 0 tracks the registers of non-DMA commands, index 1 those of DMA commands
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that tracks the given command"""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the size in bytes of all commands emitted so far"""
        return sum(len(words) * CommandStreamEmitter.WORD_SIZE for words in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a list of words"""
        flat = []
        for words in self.cmd_stream:
            flat.extend(words)
        return flat

    def print_cmds(self):
        """Prints a table with one row per emitted command"""
        header_columns = (
            f" {'Offset':6}:",
            f" {'Payload':8}",
            f"{'Param':4}",  # no leading space for alignment
            f" {'Code':4}",
            f" - {'Command':30}",
            f" {'Param':5}",
        )
        print("".join(header_columns))

        offset = 0
        for words in self.cmd_stream:
            code = words[0] & 0x0000FFFF  # lower 16 bits
            param = words[0] >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)
            opcode = code & CmdMode.CmdOpMask

            if payload_mode == CmdMode.NoPayload:
                payload_text = f" {'':8}"
                command_text = f" - {cmd0(opcode):30}"
                size = 4
            else:
                assert payload_mode == CmdMode.Payload32
                payload_text = f" {words[1]:08x}"
                command_text = f" - {cmd1(opcode):30}"
                size = 8

            row = "".join(
                (f"{offset:#08x}:", payload_text, f" {param:04x}", f" {code:04x}", command_text, f" {param:5}")
            )
            print(row)
            offset += size

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word command with a 16-bit parameter, unless it is redundant"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a command followed by a 32-bit payload word, unless it is redundant"""
        payload = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, payload)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, payload))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a payload command; bits 32 and up of offset are passed as the command parameter"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; wait commands are always written to the stream"""
        param = channel * 16 + outstanding_count
        command = cmd.value | ((param & 0xFFFF) << 16)
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command and switches the register bank of its register machine"""
        command = cmd.value | ((int(param) & 0xFFFF) << 16)
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
218
219
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100220# -------------------------------------------------------------------
221# REGISTER GENERATION
222# -------------------------------------------------------------------
223
224
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100225# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST register value"""

    BroadcastHdim = 0x01
    BroadcastWdim = 0x02
    BroadcastCdim = 0x04
    ReverseOperandOrder = 0x40
    UseIFM2Scalar = 0x80
232
233
# Maps an API pooling operation to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise operation to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation operation to the corresponding activation register enum;
# table lookup activations are not listed here
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode register enum
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision: 8 -> 0, 16 -> 1, 32 -> 2
precision_map = {size_in_bits: precision for precision, size_in_bits in enumerate((8, 16, 32))}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
281
282
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    :param memory_accesses: the memory accesses to check
    :param mem_limits: maps a region to the highest address offset that is allowed in that region
    :raises VelaError: if a region is not present in mem_limits, or if the start or end of an accessed
        range is negative or greater than the limit of its region
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Deliberately not called "max", so that the builtin max() is not shadowed
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {limit}. Perhaps try running"
                            f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
                            f" allocator"
                        )
Louis Verhaard024c3552021-03-17 14:26:34 +0100300
301
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_registers = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_registers:
        emit.cmd0_with_param(pad_cmd, amount)
308
309
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No activation given is treated as NONE_OR_RELU without explicit min/max
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    # Clamp limits default to the full range of the OFM data type
    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Keep the limits within both the int16 range and the range of the OFM data type
    int16_limits = np.iinfo(np.int16)
    quantized_min = max(quantized_min, int16_limits.min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, int16_limits.max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
336
337
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Every base pointer must be a multiple of 16 bytes for this layout
        assert not any(int(address) % 16 for address in addresses)
    # Exactly four base pointers are programmed
    for tile_idx in range(4):
        ptr_cmd, address = ptr_cmds[tile_idx], addresses[tile_idx]
        emit.cmd1_with_address(ptr_cmd, address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100345
346
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    height0_cmd, height1_cmd, width0_cmd = tile_cmds[0], tile_cmds[1], tile_cmds[2]
    # The registers are programmed with the size minus one
    emit.cmd0_with_param(height0_cmd, tiles.height_0 - 1)
    emit.cmd0_with_param(height1_cmd, tiles.height_1 - 1)
    emit.cmd0_with_param(width0_cmd, tiles.width_0 - 1)
352
353
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    stride_registers = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_registers:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100362
363
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    sign_bit = 1 if dtype.is_signed() else 0  # bit 0
    precision_bits = precision_map[dtype.size_in_bits()] << 2  # from bit 2
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0  # bit 6
    scale_bits = op_to_scale << 8  # from bit 8

    prec = (sign_bit + precision_bits) | layout_bit | scale_bits
    emit.cmd0_with_param(precision_cmd, prec)
376
377
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    dtype = ofm.data_type
    sign_bit = 1 if dtype.is_signed() else 0  # bit 0
    precision_bits = precision_map[dtype.size_in_bits()] << 1  # from bit 1
    # Global scale bit, as opposed to using per channel scale
    global_scale_bit = (1 << 8) if use_global_scale else 0
    layout_bit = (1 << 6) if ofm.layout == NpuLayout.NHCWB16 else 0
    rounding_bits = rounding_mode_map[npu_op.rounding_mode] << 14

    prec = (sign_bit + precision_bits) | global_scale_bit | layout_bit | rounding_bits
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
392
393
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape, ifm2_shape = npu_op.ifm.shape, npu_op.ifm2.shape
        # A dimension in which the shapes differ is broadcast; IFM2 must have size 1 there
        per_dimension = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),  # 'H' dimension
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),  # 'W' dimension
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),  # 'C' dimension
        )
        for ifm_size, ifm2_size, broadcast_flag in per_dimension:
            if ifm_size != ifm2_size:
                assert ifm2_size == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
421
422
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100438
439
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    base_cmds = [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

    if not has_scalar:
        # Region, base pointers and tiles are only generated when IFM2 is not a scalar
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100455
456
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The OFM dimensions are programmed as size minus one
    shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100474
475
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1

    stride = stride_x_m1 & 1  # kernel x stride low bit
    stride |= (stride_y_m1 & 1) << 1  # kernel y stride low bit
    stride |= (stride_x_m1 >> 1) << 6  # kernel x stride extension bits
    stride |= (stride_y_m1 >> 1) << 9  # kernel y stride extension bits
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
493
494
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # One (base, length) register pair per core
    core_registers = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    for core, (base_cmd, length_cmd) in enumerate(core_registers):
        if core < len(weights):
            # This core has its own weight stream
            address, length = weights[core].address, weights[core].length
        elif core < arch.ncores:
            # The core is present but has no weight stream: program a zero length
            address, length = weights[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
513
514
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # One (base, length) register pair per core
    core_registers = (
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    )
    for core, (base_cmd, length_cmd) in enumerate(core_registers):
        if core < len(biases):
            # This core has its own scale/bias stream
            address, length = biases[core].address, biases[core].length
        elif core < arch.ncores:
            # The core is present but has no scale/bias stream: program a zero length
            address, length = biases[0].address, 0
        else:
            continue
        emit.cmd1_with_address(base_cmd, address)
        emit.cmd1_with_offset(length_cmd, length)
530
531
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # The registers are programmed with the block dimension minus one
    block_registers = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, dimension in block_registers:
        emit.cmd0_with_param(blk_cmd, dimension - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100540
541
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        # IFM2_IB_START is only generated for operations that have a second input
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100553
554
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): this function has no implementation and returns None for every input
    return None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100562
563
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        block_type = NpuBlockType.Default
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # A table lookup activation needs two LUT banks
    act = npu_op.activation
    uses_lut = act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has a quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm] + ([npu_op.ifm2] if npu_op.ifm2 is not None else [])
    all_fms_have_quant = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100613
614
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    # A negative count means that no wait is needed for that unit
    waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding_count in waits:
        if outstanding_count >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding_count)
622
623
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    ifm, ofm = npu_op.ifm, npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are generated for everything except elementwise operations
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block configuration and the SHRAM layout that goes with it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100650
651
652# -------------------------------------------------------------------
653# SCALING
654# -------------------------------------------------------------------
655
656
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations.

    The (scale, shift) pair written to the register is taken from the first case that applies:
      1. the operation has a fused SIGMOID or TANH activation;
      2. pool_op.fused_quantize is set;
      3. pool_op.rescale is set, either as an ExplicitScaling or as a plain rescale factor;
      4. otherwise it is derived from the IFM/OFM quantization scales, falling back to
         scale=1, shift=0 when either of the two scales is missing.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        # 0x3000 == 3 * 4096
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an input scale of 1/4096, shift == 1 to 1/2048
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid) or 1/2048 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Keep doubling rescale (counting the doublings in shift) until it exceeds
                # half of the int16 maximum, or the shift limit is reached
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # Number of binary digits of the rounded up rescale value, plus one
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization scales available; no scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
729
730
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the OFM_SCALE register of an elementwise operation and, for ADD/SUB,
    also the OPA_SCALE and OPB_SCALE registers.

    Returns the operand to scale; this is 0 unless advanced ADD/SUB scaling was selected.
    """
    sub_op = npu_op.sub_op_type
    op_to_scale = 0
    if sub_op in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A feature map without quantization contributes a scale of None
        ifm_scale_f32 = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        ifm2_scale_f32 = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        ofm_scale_f32 = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        act = npu_op.activation
        if act is not None and act.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
            # A fused sigmoid/tanh overrides the output scale
            ofm_scale_f32 = 1 / 0x3000

        if sub_op == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (ifm_scale_f32, ifm2_scale_f32, ofm_scale_f32):
                # No ofm scaling
                ofm_scale, shift = 1, 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(ifm_scale_f32, ifm2_scale_f32, ofm_scale_f32)
        else:
            # Add/Sub; by default the operands are not scaled
            opa_scale, opb_scale, opa_shift = 1, 1, 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            needs_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (ifm_scale_f32, ifm2_scale_f32, ofm_scale_f32):
                # No ofm scaling
                ofm_scale, shift = 1, 0
            elif ifm_scale_f32 == ifm2_scale_f32:
                # Both inputs share the same scale; opa_shift stays 0 as it is unused for this case
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    ifm_scale_f32, ifm2_scale_f32, ofm_scale_f32
                )
                if bitdepth == 16:
                    # int16: align the double rounding with that of advanced scaling
                    opa_scale //= 2
                    opb_scale //= 2
                    shift -= 1
                else:
                    # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                    # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                    # the following we know that double rounding will have no effect for advanced scaling
                    # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                    needs_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                needs_advanced_scaling = True

            if needs_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                opa_scale, opa_shift, ofm_scale, shift, op_to_scale = scaling.advanced_elementwise_add_sub_scale(
                    ifm_scale_f32, ifm2_scale_f32, ofm_scale_f32, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    op_to_scale = (
                        scaling.OperandToScale.OPb
                        if op_to_scale == scaling.OperandToScale.OPa
                        else scaling.OperandToScale.OPa
                    )
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        ofm_scale, shift = scaling.quantise_scale(npu_op.ofm.quantization.scale_f32)
    else:
        # All other elementwise operations: no ofm scaling
        ofm_scale, shift = 1, 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
819
820
821# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100822# PRINT
823# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200824
825
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, quantization, strides, tiles and name of a feature map.

    name is the label used for the feature map in the output.
    Nothing is printed when fm is None.
    """
    if fm is None:
        return

    quant = fm.quantization
    if quant is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {quant.scale_f32}, zero: {quant.zero_point}"

    h, w, c = fm.shape
    size = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={size}, {quant_str}")

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    tiles = fm.tiles
    base_addresses = list(map(hex, tiles.addresses))
    print(
        f" {stride_str}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, base={base_addresses}"
    )
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100842
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100843
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a description of one NPU operation.

    A header line with index, operation type and name is always printed; cmd, when truthy,
    is appended to that line. DMA operations and operations that are neither DMA nor block
    operations only get the header line. Block operations are followed by their feature maps,
    kernel, padding, weights, scales, activation and block config.
    """
    pass_info = f" {cmd}" if cmd else ""

    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, NpuBlockOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return

    has_sub_op = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))
    is_conv2d = isinstance(npu_op, NpuConv2DOperation)
    k = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None

    # Header line
    if has_sub_op:
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D where kernel elements, strides and dilations are all 1 is labelled fully connected
        is_unit_kernel = (
            is_conv2d and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        )
        fc = "FullyConnected " if is_unit_kernel else ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and scales
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    # The activation is only printed when it is more than a NONE_OR_RELU without min/max
    act = npu_op.activation
    if act is not None and (
        act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None
    ):
        lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
        print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if is_conv2d:
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_sub_op else ""
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100891
Tim Hall79d07d22020-04-27 18:20:16 +0100892
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list together with its index in the list.

    npu_op_to_cmd optionally maps an operation to extra information that is passed on
    to print_operation; operations without an entry are printed without it.
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for index, npu_op in enumerate(npu_op_list):
        cmd = npu_op_to_cmd.get(npu_op)
        print_operation(npu_op, index, cmd)
Tim Hall79d07d22020-04-27 18:20:16 +0100897
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100898
899# -------------------------------------------------------------------
900# OPERATIONS
901# -------------------------------------------------------------------
902
903
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates the NPU_OP_* command for the given operation.

    DMA, Conv2D, depthwise convolution, pooling and elementwise operations are supported.
    Raises AssertionError for any other type of operation.
    """
    if isinstance(npu_op, NpuDmaOperation):
        # Channel and mode are packed into a single parameter
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        # Raised explicitly rather than with "assert 0": assert statements are removed when
        # python runs with -O, which would silently leave the operation without NPU_OP command
        raise AssertionError("Unsupported operation")
918
919
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates the register commands for a Conv2D operation, using the operation's own block traversal"""
    block_traversal = npu_op.block_traversal
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100923
924
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter,
    npu_op: NpuConvDepthWiseOperation,
    arch: ArchitectureFeatures,
):
    """Generates the register commands for a depthwise convolution; always uses depth first block traversal"""
    block_traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100930
931
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations.

    Emits the common registers and, when global scaling is used, the OFM_SCALE register.
    Global scaling is used for AVERAGE/REDUCE_SUM without padding, unless the operation
    carries an ExplicitScaling, in which case it is used exactly when that scaling is not per channel.

    Raises VelaError for a REDUCE_SUM whose IFM is not NHWC when the IFM data type is INT32
    or the accelerator config is Ethos_U65_512.
    """
    # check that reduce_sum input is NHWC
    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # isinstance is False for None, so no separate None check is needed
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100957
958
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates the register commands for an elementwise operation.

    The scaling registers are generated first, then the common registers and finally,
    for binary operations only, the IFM2 registers.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no second input
        return

    # Binary operation; generate IFM2 registers
    ifm2 = npu_op.ifm2
    assert ifm2 is not None
    scalar = npu_op.ifm2_scalar
    has_scalar = scalar is not None
    generate_ifm2(emit, ifm2, has_scalar)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        # The scalar is written to the register in quantized form and must fit the IFM2 data type
        quantized_scalar = quantise(scalar, ifm2.quantization)
        assert ifm2.data_type.min_value() <= quantized_scalar <= ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100984
985
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates the DMA0 source, destination and length registers for a DMA operation.

    The length is taken from the source of the transfer.
    """
    src = dma_op.src
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, src.address)

    dest = dma_op.dest
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dest.address)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100994
995
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches on the type of the operation; nothing is returned.

    Raises AssertionError if the type of the operation is not supported.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        # Raised explicitly rather than with "assert 0": assert statements are removed when
        # python runs with -O, which would silently generate no registers for the operation
        raise AssertionError("Unsupported operation")
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001013
1014
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates the register command stream for the given list of NPU operations.

    For every operation the memory accesses are checked against mem_limits, after which its
    registers, BLOCKDEP (block operations only), wait commands and NPU_OP command are emitted.
    The stream is terminated with NPU_OP_STOP. add_to_debug_db, when given, is called with
    every operation and the emitter offset after that operation's commands. With verbose set,
    the operations and the generated commands are printed.

    Returns Ethos-U instructions, as a list of 32-bit integers.
    Raises VelaError if an operation fails (annotated with the operation's index and type),
    or if the command stream is 16 MiB or larger.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    prev_op = None
    dep_watermark = Watermark(0, 0)
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        is_block_op = isinstance(npu_op, NpuBlockOperation) and not isinstance(npu_op, NpuDmaOperation)
        if is_block_op:
            # Generate BLOCKDEP, limited to what the architecture supports
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    max_stream_bytes = 1 << 24  # 16 MiB
    stream_bytes = emit.size_in_bytes()
    if stream_bytes >= max_stream_bytes:
        raise VelaError(
            "The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {stream_bytes/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res
1080
1081
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    The command stream is generated for the default architecture of the accelerator, non-verbosely.
    Regions 0-7 are limited to the architecture's maximum address offset, and the mem2mem
    region to the size of the SHRAM.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator value used to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    mem_limits = {region: arch.max_address_offset for region in range(8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)