blob: 016546974471f9c47b137ccdc18342c4e76bf641 [file] [log] [blame]
William Isaksson56e5f0c2024-01-10 12:28:04 +01001# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010021import math
Tim Hall79d07d22020-04-27 18:20:16 +010022from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010023from enum import Enum
24from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010025from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010026from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from typing import List
28from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010029
30import numpy as np
31
32from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010033from .api import NpuAccelerator
William Isaksson56e5f0c2024-01-10 12:28:04 +010034from .api import NpuAccumulatorType
Louis Verhaarde8a5a782020-11-02 18:04:27 +010035from .api import NpuActivation
36from .api import NpuActivationOp
37from .api import NpuAddressRange
38from .api import NpuBlockOperation
39from .api import NpuBlockTraversal
40from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010041from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010042from .api import NpuDataType
43from .api import NpuDmaOperation
44from .api import NpuElementWiseOp
45from .api import NpuElementWiseOperation
46from .api import NpuFeatureMap
47from .api import NpuKernel
48from .api import NpuLayout
49from .api import NpuOperation
50from .api import NpuOperationType
51from .api import NpuPadding
52from .api import NpuPoolingOp
53from .api import NpuPoolingOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010054from .api import NpuResamplingMode
55from .api import NpuRoundingMode
56from .api import NpuShape3D
57from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010058from .architecture_allocator import ArchitectureBlockConfig
59from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010060from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010061from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010062from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010063from .architecture_features import SHRAMElements
William Isakssona4f84112023-06-19 15:31:46 +000064from .errors import ByteAlignmentError
65from .errors import ByteSizeError
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010066from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010067from .ethos_u55_regs.ethos_u55_regs import acc_format
68from .ethos_u55_regs.ethos_u55_regs import activation
69from .ethos_u55_regs.ethos_u55_regs import cmd0
70from .ethos_u55_regs.ethos_u55_regs import cmd1
71from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020072from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020073from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010074from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010075from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010076from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020077from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010078from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010079from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010080from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010081from .register_command_stream_util import calc_blockdep
William Isakssona4f84112023-06-19 15:31:46 +000082from .register_command_stream_util import check_addresses
83from .register_command_stream_util import check_alignment
84from .register_command_stream_util import check_dma_op
Björn Davidsson199e8e62023-10-10 11:22:59 +020085from .register_command_stream_util import check_length
William Isakssona4f84112023-06-19 15:31:46 +000086from .register_command_stream_util import check_strides
Louis Verhaard1e170182020-11-26 11:42:04 +010087from .register_command_stream_util import get_dma_memory_accesses
88from .register_command_stream_util import get_op_memory_accesses
89from .register_command_stream_util import get_strides
90from .register_command_stream_util import get_wait_dependency
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010091from .register_command_stream_util import get_zero_point
Louis Verhaard1e170182020-11-26 11:42:04 +010092from .register_command_stream_util import has_ifm2
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010093from .register_command_stream_util import quantise
Tim Halld8339a72021-05-27 18:49:40 +010094from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010095from .register_command_stream_util import to_kernel
96from .register_command_stream_util import UNARY_ELEMWISE_OPS
97from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010098
99
class RegisterMachine:
    """Shadow copy of register contents, used to tell whether a register write changes anything"""

    def __init__(self):
        self.n_banks = 1
        # One mapping per bank; a register that has never been written reads back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the active bank and returns True if it differs from the previous content"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        return previous != value

    def switch_bank(self):
        """Advances to the next bank, wrapping around after the last one"""
        next_idx = self.bank_idx + 1
        self.bank_idx = next_idx % self.n_banks
114
115
class CmdMode(IntEnum):
    """Payload mode values and bit masks applied to the 16-bit code part of a command word"""

    NoPayload = 0
    Payload32 = 1 << 14
    # Selects the two payload mode bits
    Mask = 0b11 << 14
    # Selects the ten bits holding the command opcode
    CmdOpMask = (1 << 10) - 1
121
122
class CommandStreamEmitter:
    """Accumulates command words; register writes that would not change the tracked register value are dropped"""

    WORD_SIZE = 4

    def __init__(self):
        # Each entry is a tuple holding the one or two 32-bit words of a single command
        self.cmd_stream = []
        # Index 0 tracks non-DMA registers, index 1 tracks DMA registers (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Size in bytes of everything appended so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine for cmd; commands with DMA in their name are tracked separately"""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the total size of the command stream in bytes"""
        return sum(len(cmd) * CommandStreamEmitter.WORD_SIZE for cmd in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns all command words as one flat list, in emission order"""
        words = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a table with one row per command: byte offset, payload, parameter, code and command name"""
        # NOTE(review): this heading is one character narrower than the "0x......:" offset column printed
        # for each row below - confirm the intended padding
        heading = "".join(
            (
                f" {'Offset':6}:",
                f" {'Payload':8}",
                f"{'Param':4}",  # no leading space for alignment
                f" {'Code':4}",
                f" - {'Command':30}",
                f" {'Param':5}",
            )
        )
        print(heading)

        offset = 0
        for words in self.cmd_stream:
            first_word = words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            if payload_mode == CmdMode.NoPayload:
                payload_column = f" {'':8}"
            else:
                assert payload_mode == CmdMode.Payload32
                payload_column = f" {words[1]:08x}"

            row = f"{offset:#08x}:" + payload_column
            row += f" {param:04x}"
            row += f" {code:04x}"

            # The opcode is decoded with cmd0 for single word commands and with cmd1 for commands with payload
            if payload_mode == CmdMode.NoPayload:
                row += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                offset += 4
            else:
                row += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                offset += 8

            row += f" {param:5}"
            print(row)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single word command carrying a 16-bit parameter, unless the register already holds it"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two word command with a 32-bit payload, unless the register already holds it"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two word command; bits 32 and up of offset are passed as the command parameter"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; always written, no redundancy check is done"""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always written) and switches the bank of its register machine"""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
226
227
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100228# -------------------------------------------------------------------
229# REGISTER GENERATION
230# -------------------------------------------------------------------
231
232
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST register value"""

    BroadcastHdim = 0x01
    BroadcastWdim = 0x02
    BroadcastCdim = 0x04
    ReverseOperandOrder = 0x40
    UseIFM2Scalar = 0x80
240
241
# Maps API pooling operations to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps API elementwise operations to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps API activation operations to activation members (TABLE_LOOKUP has no entry here)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an explicitly requested API accumulator type to the corresponding acc_format value
npu_acc_format_map = {
    NpuAccumulatorType.Int32: acc_format.INT_32BIT.value,
    NpuAccumulatorType.Int40: acc_format.INT_40BIT.value,
}

# Maps API resampling modes to resampling_mode members
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {
    8: 0,
    16: 1,
    32: 2,
}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
294
295
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    memory_accesses: the accesses to verify
    mem_limits: maps a region to the highest address offset that is accepted for that region

    Raises VelaError if a region has no entry in mem_limits, or if the start or end of any accessed range
    is negative or larger than the limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named region_limit rather than max, so that the builtin max() is not shadowed
            region_limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > region_limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {region_limit}."
                            f" Perhaps try running"
                            f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
                            f" allocator"
                        )
Louis Verhaard024c3552021-03-17 14:26:34 +0100313
314
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    # Emission order is top, left, bottom, right
    pad_settings = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_settings:
        emit.cmd0_with_param(pad_cmd, amount)
321
322
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No activation given is handled as NONE_OR_RELU
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation
    dtype = ofm.data_type
    int16_info = np.iinfo(np.int16)

    # A missing min/max defaults to the limit of the OFM data type, otherwise it is quantised using the
    # OFM quantization
    quantized_min = dtype.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = dtype.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Clamp to what fits both in int16 and in the OFM data type
    quantized_min = max(quantized_min, int16_info.min, dtype.min_value())
    quantized_max = min(quantized_max, int16_info.max, dtype.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if dtype == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
349
350
def generate_addresses(
    emit: CommandStreamEmitter,
    ptr_cmds: List[cmd1],
    addresses: List[int],
    layout: NpuLayout,
    element_size,
    arch: ArchitectureFeatures,
):
    """Generates xFM_BASE registers"""
    # The addresses are validated before anything is emitted
    check_addresses(addresses, layout, element_size, arch)
    # Exactly four base pointer commands are emitted, one per entry of ptr_cmds/addresses
    for tile_idx in (0, 1, 2, 3):
        emit.cmd1_with_address(ptr_cmds[tile_idx], addresses[tile_idx])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100363
364
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # tile_cmds is expected in the order HEIGHT0, HEIGHT1, WIDTH0; each value is emitted minus one
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for idx, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[idx], size - 1)
370
371
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    fm_strides = get_strides(fm)
    check_strides(fm, fm_strides)

    stride_settings = (
        (stride_c_cmd, fm_strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, fm_strides.height),  # stride between vertical values (H)
        (stride_x_cmd, fm_strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_settings:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100382
383
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    data_type = fm.data_type
    # Bit 0 is set for signed data types
    signed_flag = 1 if data_type.is_signed() else 0
    # Activation precision is placed from bit 2
    precision_bits = precision_map[data_type.size_in_bits()] << 2
    # Bit 6 is set for the NHCWB16 layout
    layout_flag = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0

    # op_to_scale is placed from bit 8
    value = (signed_flag + precision_bits) | layout_flag | (op_to_scale << 8)
    emit.cmd0_with_param(precision_cmd, value)
396
397
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    data_type = ofm.data_type
    # Bit 0 is set for signed data types, activation precision is placed from bit 1
    value = (1 if data_type.is_signed() else 0) + (precision_map[data_type.size_in_bits()] << 1)

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        value |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        value |= 1 << 6
    # Rounding mode is placed from bit 14
    value |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, value)
412
413
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    flags = 0
    ifm_shape_owner = npu_op.ifm
    ifm2_shape_owner = npu_op.ifm2
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        # A dimension in which the two inputs differ is broadcast; IFM2 must then have size 1 in it
        dimension_flags = (
            ("height", IFM2Broadcast.BroadcastHdim),
            ("width", IFM2Broadcast.BroadcastWdim),
            ("depth", IFM2Broadcast.BroadcastCdim),
        )
        for dimension, flag in dimension_flags:
            ifm2_size = getattr(ifm2_shape_owner.shape, dimension)
            if getattr(ifm_shape_owner.shape, dimension) != ifm2_size:
                assert ifm2_size == 1
                flags |= flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
441
442
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general IFM registers"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    # Order of emission: region, base addresses, tile sizes, depth, strides, zero point
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout, ifm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100460
461
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool, arch: ArchitectureFeatures):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, addresses, tiles and strides are only generated when IFM2 is not a scalar
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit, base_cmds, ifm2.tiles.addresses, ifm2.layout, ifm2.data_type.size_in_bytes(), arch
        )
        generate_tiles(emit, tile_cmds, ifm2.tiles)
        generate_strides(
            emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X
        )
    # The zero point is generated in both cases
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100479
480
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general OFM registers"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout, ofm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The full OFM shape is emitted as size minus one, in the order height, width, depth
    shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100500
501
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1

    # set kernel x stride low bit
    stride = stride_x_m1 & 1
    # set kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9
    # dilation minus one for x and y goes to bit 3 and bit 4
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    # bit 2 is set for part-kernel-first block traversal
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
519
520
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    num_ranges = len(weights)
    if num_ranges == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # Set weights sources for active and present cores
    core_cmds = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < num_ranges:
            # This core has its own weight range
            source = weights[core]
            emitted_length = source.length
        elif core < arch.ncores:
            # Core is present but has no weight range: use the address of the first range with length 0
            source = weights[0]
            emitted_length = 0
        else:
            continue
        check_alignment(source.address, 16)
        check_length(source.length, 16)
        emit.cmd1_with_address(base_cmd, source.address)
        emit.cmd1_with_offset(length_cmd, emitted_length)
543
544
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers

    Nothing is generated when biases is empty. Otherwise the region of the first range is emitted, followed
    by base/length for up to two cores: a core with its own range gets that range, a core that is present
    (core < arch.ncores) but has no range gets the address of the first range with length 0.
    """
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            # Validate the length before emitting anything for this core, so that a failed check does not
            # leave a base address without its matching length in the stream (same order as generate_weights)
            check_length(biases[core].length, 16)
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)
561
562
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # Each block dimension is emitted as size minus one
    block_settings = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, size in block_settings:
        emit.cmd0_with_param(blk_cmd, size - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100571
572
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)

    # An accumulator type requested on the operation takes precedence over the one of the block config
    if npu_op.accumulator_type == NpuAccumulatorType.Default:
        acc_format_value = acc_format_map[arch_block_config.acc_type]
    else:
        acc_format_value = npu_acc_format_map[npu_op.accumulator_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_value)
588
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # Table lookup activation needs two LUT banks
    act = npu_op.activation
    uses_lut = act is not None and act.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100638
639
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits the KERNEL_WAIT and DMA_WAIT commands requested by cmd_waits.

    A wait command is only emitted for a watermark value that is >= 0;
    the kernel wait (if any) is always emitted before the DMA wait.
    """
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in pending_waits:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
647
648
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Emits the register settings shared by most block operations.

    In order: IFM, IFM precision/upscale, optional padding, OFM, OFM precision,
    kernel (skipped for elementwise operations), weights, biases, activation,
    block config and SHRAM registers.
    """
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm, arch)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_param = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_param)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm, arch)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Elementwise operations have no kernel registers
    is_elementwise = npu_op.op_type == NpuOperationType.ElementWise
    if not is_elementwise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # The architecture block config is resolved before the block config registers are written
    shram_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100675
676
677# -------------------------------------------------------------------
678# SCALING
679# -------------------------------------------------------------------
680
681
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations

    Selects a (scale, shift) pair and emits it with NPU_SET_OFM_SCALE. The pair is chosen by the
    first matching case:
      1. the op has a fused SIGMOID/TANH activation,
      2. pool_op.fused_quantize is set,
      3. pool_op.rescale is set (either an ExplicitScaling or a plain rescale value),
      4. otherwise, derived from the IFM/OFM quantization scales, or (1, 0) if either is missing.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        # 0x3000 == 3 * 4096
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            # Treat the input scale as a power of two if its log2 is within 0.001 of an integer
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an input scale of 1/4096, shift == 1 to 1/2048
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid) or 1/2048 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale (incrementing shift each time) until it exceeds half the int16
                # maximum or shift exceeds 30; the truncated result becomes the scale
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # len(bin(x)) - 2 is the number of binary digits of x (bin() adds a "0b" prefix);
            # one extra bit is added on top of that
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # NOTE(review): exact type comparison; a subclass of ExplicitScaling would take the else branch
        if type(pool_op.rescale) == ExplicitScaling:
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            # Only the first multiplier/shift entry is used (per-channel scaling is excluded above)
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    # Negative bit count, derived from the binary digits of the rounded-up reciprocal
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization scales available: use identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
754
755
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    OPA_SCALE and OPB_SCALE are only emitted for ADD/SUB; OFM_SCALE is always emitted.

    Returns the operand to scale. This is 0 unless the advanced ADD/SUB scaling path is taken,
    in which case it is the value chosen by scaling.advanced_elementwise_add_sub_scale
    (swapped between OPa and OPb when npu_op.reversed_operands is set).
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A missing quantization is represented by a scale of None
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # A fused sigmoid/tanh overrides the OFM scale with a fixed 1/0x3000
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            # NOTE(review): truthiness test here, whereas the Add/Sub path below tests "is not None"
            if npu_op.rescale:
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                # (i.e. simplified scaling is kept only when the low 12 bits of ofm_scale are all zero)
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                # Input scales differ
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # NOTE(review): assumes the OFM has a quantization; there is no None check on this path
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other elementwise sub-ops use identity OFM scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
844
845
846# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100847# PRINT
848# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200849
850
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints a summary of the given feature map, labelled with name.

    Prints nothing when fm is None. Otherwise prints shape, region, layout,
    data type, size in bytes and quantization, followed by strides/tiles and
    the feature map's own name.
    """
    if fm is None:
        return

    quant = fm.quantization
    if quant is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100867
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100868
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a human readable description of a single NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations,
    get a one-line summary. Block operations additionally get their feature
    maps, kernel, padding, weights, scales, activation and block config printed.
    cmd, when truthy, is appended to the header line.
    """
    pass_info = f" {cmd}" if cmd else ""

    is_dma = isinstance(npu_op, NpuDmaOperation)
    if isinstance(npu_op, NpuOperation) and not (is_dma or isinstance(npu_op, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if is_dma:
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return

    # Block operation: header line
    k = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    has_rescale = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))
    if has_rescale:
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations multiply to 1 is reported as fully connected
        fc = ""
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and scales
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    # Activation is only printed when it is not a plain NONE_OR_RELU without limits
    act = npu_op.activation
    if act is not None:
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            if act.op_type == NpuActivationOp.TABLE_LOOKUP:
                lut = f", lut index={act.lookup_table_index}"
            else:
                lut = ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")

    # Block config line
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_rescale else ""
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100916
Tim Hall79d07d22020-04-27 18:20:16 +0100917
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list, prefixed by its index in the list.

    npu_op_to_cmd optionally maps an operation to extra information that is
    shown on the operation's header line; operations without an entry get None.
    """
    cmd_lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, cmd_lookup.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100922
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100923
924# -------------------------------------------------------------------
925# OPERATIONS
926# -------------------------------------------------------------------
927
928
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the type of npu_op.

    Asserts on operation types that have no corresponding NPU_OP_* command.
    """
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start parameter combines channel and mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
943
944
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the registers for a Conv2D operation, using the operation's own block traversal"""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100948
949
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the registers for a depthwise convolution; block traversal is always DEPTH_FIRST"""
    traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100955
956
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Emits the registers for a pooling operation.

    Raises VelaError for a REDUCE_SUM whose IFM is not NHWC when either the IFM
    data type is INT32 or the accelerator config is Ethos_U65_512.
    """
    # REDUCE_SUM layout restrictions
    is_reduce_sum = npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM
    if is_reduce_sum and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # Global scaling is used for AVERAGE/REDUCE_SUM when all padding values sum to zero
    # NOTE(review): sum() raises if npu_op.padding is None for these sub-ops - confirm callers always set it
    globally_scaled_ops = (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops and sum(npu_op.padding) == 0

    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    explicit = npu_op.rescale
    if explicit is not None and type(explicit) is ExplicitScaling:
        use_global_scale = not explicit.per_channel

    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)

    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100982
983
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the registers for an elementwise operation.

    Scaling registers are generated first, then the common registers, and
    finally, for binary operations only, the IFM2 registers.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Elementwise op specific: unary operations have no IFM2 registers
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return

    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar, arch)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return

    # The scalar is quantized and must fit the IFM2 data type
    quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
    assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001009
1010
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
    """Emits the DMA0 source/destination/length registers for a DMA operation.

    The operation is validated with check_dma_op before any register is emitted.
    """
    check_dma_op(dma_op, arch)

    # Source region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)

    # Destination region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)

    # Transfer length is taken from the source
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001021
1022
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches on the operation type; asserts if the type is not supported.
    """
    # Checked in order; the first matching operation type wins
    register_generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
        (NpuDmaOperation, generate_dma_op),
    )
    for op_class, generate in register_generators:
        if isinstance(npu_op, op_class):
            generate(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001040
1041
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to generate commands for; each must be a DMA or block operation
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the input operations and the generated command stream
    :param mem_limits: memory limits that each operation's memory accesses are checked against
    :param add_to_debug_db: optional callable, invoked as add_to_debug_db(npu_op, offset) for every
        non-DMA operation once its NPU_OP command has been emitted
    :param npu_op_to_cmd: optional mapping from operation to extra information; only used for verbose printing
    :raises ByteAlignmentError, ByteSizeError, VelaError: re-raised from per-operation checks/generation with
        the operation index and type appended to the message
    :raises VelaError: if the generated command stream is 1 << 24 bytes or larger
    """
    emit = CommandStreamEmitter()
    if verbose:
        print("Register-Level Command Stream: Input")
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    # Previous block operation (DMA operations are not tracked here); input to the BLOCKDEP calculation
    prev_op = None
    # Generate register commands for all operations
    outstanding_dma_ops: List[NpuOperation] = list()
    outstanding_npu_ops: List[NpuOperation] = list()
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops)
            generate_registers_for_op(emit, npu_op, arch)
        except ByteAlignmentError as e:
            # Enables testing for ByteAlignmentErrors specifically
            raise ByteAlignmentError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        except ByteSizeError as e:
            # Enables testing for ByteSizeErrors specifically
            raise ByteSizeError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        except VelaError as e:
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        # NOTE(review): the DMA exclusion looks redundant unless a class can be both a DMA and a block operation
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Wait commands are emitted after the operation's registers but before its NPU_OP command
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            if not isinstance(npu_op, NpuDmaOperation):
                # Subtraction by 4 is to account for that offsets are pre-incremented.
                add_to_debug_db(npu_op, emit.offset - 4)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        print("Register-Level Command Stream: Output")
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res
1117
1118
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # Regions 0..7 share the architecture's maximum address offset;
    # the mem2mem base pointer index is limited by the SHRAM size instead
    mem_limits = {region: arch.max_address_offset for region in range(8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)