# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010021import math
Tim Hall79d07d22020-04-27 18:20:16 +010022from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010023from enum import Enum
24from enum import IntEnum
Jonas Ohlsson845e2322022-03-01 12:39:55 +010025from typing import cast
Dwight Lidman9b43f842020-12-08 17:56:44 +010026from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from typing import List
28from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010029
30import numpy as np
31
32from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010033from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010034from .api import NpuActivation
35from .api import NpuActivationOp
36from .api import NpuAddressRange
37from .api import NpuBlockOperation
38from .api import NpuBlockTraversal
39from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010040from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010041from .api import NpuDataType
42from .api import NpuDmaOperation
43from .api import NpuElementWiseOp
44from .api import NpuElementWiseOperation
45from .api import NpuFeatureMap
46from .api import NpuKernel
47from .api import NpuLayout
48from .api import NpuOperation
49from .api import NpuOperationType
50from .api import NpuPadding
51from .api import NpuPoolingOp
52from .api import NpuPoolingOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010053from .api import NpuResamplingMode
54from .api import NpuRoundingMode
55from .api import NpuShape3D
56from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010057from .architecture_allocator import ArchitectureBlockConfig
58from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010059from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010060from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010061from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010062from .architecture_features import SHRAMElements
William Isakssona4f84112023-06-19 15:31:46 +000063from .errors import ByteAlignmentError
64from .errors import ByteSizeError
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010065from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010066from .ethos_u55_regs.ethos_u55_regs import acc_format
67from .ethos_u55_regs.ethos_u55_regs import activation
68from .ethos_u55_regs.ethos_u55_regs import cmd0
69from .ethos_u55_regs.ethos_u55_regs import cmd1
70from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020071from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020072from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010073from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010074from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010075from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020076from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010077from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010078from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010079from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010080from .register_command_stream_util import calc_blockdep
William Isakssona4f84112023-06-19 15:31:46 +000081from .register_command_stream_util import check_addresses
82from .register_command_stream_util import check_alignment
83from .register_command_stream_util import check_dma_op
84from .register_command_stream_util import check_size
85from .register_command_stream_util import check_strides
Louis Verhaard1e170182020-11-26 11:42:04 +010086from .register_command_stream_util import get_dma_memory_accesses
87from .register_command_stream_util import get_op_memory_accesses
88from .register_command_stream_util import get_strides
89from .register_command_stream_util import get_wait_dependency
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010090from .register_command_stream_util import get_zero_point
Louis Verhaard1e170182020-11-26 11:42:04 +010091from .register_command_stream_util import has_ifm2
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010092from .register_command_stream_util import quantise
Tim Halld8339a72021-05-27 18:49:40 +010093from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010094from .register_command_stream_util import to_kernel
95from .register_command_stream_util import UNARY_ELEMWISE_OPS
96from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010097
98
class RegisterMachine:
    """Shadow copy of the most recently written register values.

    Registers are kept per bank; a register that has never been written reads back as None.
    """

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank. Returns True if this changed the stored value"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        # Debugging aid: return True unconditionally here to force every command to be emitted
        return previous != value

    def switch_bank(self):
        """Advances to the next register bank, wrapping around after the last one"""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
113
114
class CmdMode(IntEnum):
    """Bit fields of the 16-bit command code held in the low half of the first command word"""

    NoPayload = 0  # command consists of a single word
    Payload32 = 1 << 14  # command is followed by a 32-bit payload word
    Mask = 0b11 << 14  # selects the payload mode bits
    CmdOpMask = (1 << 10) - 1  # selects the command opcode bits
120
121
class CommandStreamEmitter:
    """Collects the words of a register command stream.

    Each entry of cmd_stream is a tuple holding the one or two 32-bit words of a single command.
    Register writes that would not change the value last written are dropped; separate shadow
    register state is kept for commands with "DMA" in their name and for all other commands.
    """

    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the shadow register state to use for the given command"""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the total size in bytes of all emitted commands"""
        return sum(len(cmd) for cmd in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Returns all emitted command words as one flat list"""
        words: List[int] = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a table of all emitted commands to stdout"""
        # NOTE(review): whitespace inside these literals is reproduced as found; the header may not line up with
        # the 8 character wide offset column - confirm against upstream
        header = f" {'Offset':6}:"
        header += f" {'Payload':8}"
        header += f"{'Param':4}"  # no leading space for alignment
        header += f" {'Code':4}"
        header += f" - {'Command':30}"
        header += f" {'Param':5}"
        print(header)

        byte_offset = 0
        for cmd_words in self.cmd_stream:
            first_word = cmd_words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            has_payload = payload_mode != CmdMode.NoPayload
            if has_payload:
                assert payload_mode == CmdMode.Payload32

            line = f"{byte_offset:#08x}:"
            line += f" {cmd_words[1]:08x}" if has_payload else f" {'':8}"
            line += f" {param:04x}"
            line += f" {code:04x}"

            if has_payload:
                line += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                byte_offset += 8
            else:
                line += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                byte_offset += 4

            line += f" {param:5}"
            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single word command carrying a 16-bit parameter, unless the write is redundant"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)

        # Only write the command if it changes the register value
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two word command with a 32-bit payload and 16-bit parameter, unless the write is redundant"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        # Only write the command if it changes the register value
        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            self.cmd_stream.append((command, offset))
            self.offset += 2 * CommandStreamEmitter.WORD_SIZE

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two word command; bits 32 and up of offset are passed in the parameter field"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; these are always written, regardless of shadow register state"""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always written) and switches register bank afterwards"""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
225
226
# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------
230
231
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Flag bits that are OR-ed together to form the IFM2_BROADCAST register value"""

    BroadcastHdim = 0x01  # bit 0
    BroadcastWdim = 0x02  # bit 1
    BroadcastCdim = 0x04  # bit 2
    ReverseOperandOrder = 0x40  # bit 6
    UseIFM2Scalar = 0x80  # bit 7
239
240
# Maps an NpuPoolingOp to the corresponding pooling_mode value
pooling_op_map = {
    npu_op: mode.value
    for npu_op, mode in (
        (NpuPoolingOp.MAX, pooling_mode.MAX),
        (NpuPoolingOp.AVERAGE, pooling_mode.AVERAGE),
        (NpuPoolingOp.REDUCE_SUM, pooling_mode.REDUCE_SUM),
    )
}

# Maps an NpuElementWiseOp to the corresponding elementwise_mode value
elementwise_op_map = {
    npu_op: mode.value
    for npu_op, mode in (
        (NpuElementWiseOp.MUL, elementwise_mode.MUL),
        (NpuElementWiseOp.ADD, elementwise_mode.ADD),
        (NpuElementWiseOp.SUB, elementwise_mode.SUB),
        (NpuElementWiseOp.MIN, elementwise_mode.MIN),
        (NpuElementWiseOp.MAX, elementwise_mode.MAX),
        (NpuElementWiseOp.LRELU, elementwise_mode.LRELU),
        (NpuElementWiseOp.ABS, elementwise_mode.ABS),
        (NpuElementWiseOp.CLZ, elementwise_mode.CLZ),
        (NpuElementWiseOp.SHR, elementwise_mode.SHR),
        (NpuElementWiseOp.SHL, elementwise_mode.SHL),
    )
}

# Maps an NpuActivationOp to the corresponding activation enum member (the member itself, not its value).
# NpuActivationOp.TABLE_LOOKUP is deliberately absent; generate_activation handles it separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    shram_element: fmt.value
    for shram_element, fmt in (
        (SHRAMElements.Acc16, acc_format.FP_S5_10),
        (SHRAMElements.Acc32, acc_format.INT_32BIT),
        (SHRAMElements.Acc40, acc_format.INT_40BIT),
    )
}

# Maps an NpuResamplingMode to the corresponding resampling_mode enum member (the member itself, not its value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision: 8 -> 0, 16 -> 1, 32 -> 2
precision_map = {size_in_bits: precision for precision, size_in_bits in enumerate((8, 16, 32))}

# Maps an NpuRoundingMode to the corresponding rounding value
rounding_mode_map = {
    npu_mode: mode.value
    for npu_mode, mode in (
        (NpuRoundingMode.TFL, rounding.TFL),
        (NpuRoundingMode.TRUNCATE, rounding.TRUNCATE),
        (NpuRoundingMode.NATURAL, rounding.NATURAL),
    )
}
288
289
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Validates every address range touched by an operation against the per-region limits.

    Raises VelaError if an access uses a region that has no entry in mem_limits, or if the start
    or end of any accessed range is negative or greater than the limit of its region.
    """
    for access in memory_accesses.accesses:
        for region, range_set in access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            limit = mem_limits[region]
            # Both ends of every range must lie within [0, limit]
            for addr_range in range_set.ranges:
                start, end = addr_range
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {limit}. Perhaps try running"
                            f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
                            f" allocator"
                        )
Louis Verhaard024c3552021-03-17 14:26:34 +0100307
308
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the four IFM_PAD registers, in the order top, left, bottom, right"""
    pad_registers = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_registers:
        emit.cmd0_with_param(pad_cmd, amount)
315
316
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION, ACTIVATION_MIN and ACTIVATION_MAX registers.

    If no activation is given, NpuActivationOp.NONE_OR_RELU is used. A missing min/max defaults to the
    limit of the OFM data type; a given min/max is quantised using the OFM quantization.
    """
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Clamp to what fits in both int16 and the OFM data type
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
343
344
def generate_addresses(
    emit: CommandStreamEmitter,
    ptr_cmds: List[cmd1],
    addresses: List[int],
    layout: NpuLayout,
    element_size,
    arch: ArchitectureFeatures,
):
    """Generates the four xFM_BASE registers, after validating the addresses with check_addresses"""
    check_addresses(addresses, layout, element_size, arch)
    # Exactly four base registers are written; ptr_cmds[i] receives addresses[i]
    for tile_idx in range(4):
        base_cmd = ptr_cmds[tile_idx]
        base_address = addresses[tile_idx]
        emit.cmd1_with_address(base_cmd, base_address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100357
358
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers.

    tile_cmds holds the commands for height_0, height_1 and width_0, in that order. The registers
    are written with the size minus one.
    """
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for idx, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[idx], size - 1)
364
365
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the strides of the feature map, after validating them"""
    strides = get_strides(fm)
    check_strides(fm, strides)

    stride_registers = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_registers:
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100376
377
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register.

    Register value as assembled here:
        bit 0:     set for signed data types
        bits 2..:  activation precision (see precision_map)
        bit 6:     set for NHCWB16 layout
        bits 8..:  op_to_scale
    """
    data_type = fm.data_type
    signed_bit = 1 if data_type.is_signed() else 0
    precision_bits = precision_map[data_type.size_in_bits()] << 2
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0

    prec = signed_bit + precision_bits
    prec |= layout_bit
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
390
391
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register.

    Register value as assembled here:
        bit 0:      set for signed data types
        bits 1..:   activation precision (see precision_map)
        bit 6:      set for NHCWB16 layout
        bit 8:      set to use global scale, as opposed to per channel scale
        bits 14..:  rounding mode (see rounding_mode_map)
    """
    ofm = npu_op.ofm
    data_type = ofm.data_type
    signed_bit = 1 if data_type.is_signed() else 0
    prec = signed_bit + (precision_map[data_type.size_in_bits()] << 1)

    global_scale_bit = (1 << 8) if use_global_scale else 0
    prec |= global_scale_bit
    layout_bit = (1 << 6) if ofm.layout == NpuLayout.NHCWB16 else 0
    prec |= layout_bit
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
406
407
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations.

    When IFM2 is a scalar only the UseIFM2Scalar flag (plus ReverseOperandOrder, if requested) is set.
    Otherwise a broadcast flag is set for every dimension in which IFM and IFM2 differ; IFM2 must
    have size 1 in such a dimension.
    """
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        per_dimension = (
            (ifm.shape.height, ifm2.shape.height, IFM2Broadcast.BroadcastHdim),  # 'H' dimension
            (ifm.shape.width, ifm2.shape.width, IFM2Broadcast.BroadcastWdim),  # 'W' dimension
            (ifm.shape.depth, ifm2.shape.depth, IFM2Broadcast.BroadcastCdim),  # 'C' dimension
        )
        for ifm_size, ifm2_size, broadcast_flag in per_dimension:
            if ifm_size != ifm2_size:
                assert ifm2_size == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)
435
436
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general IFM registers: region, base addresses, tiles, depth, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout, ifm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100454
455
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool, arch: ArchitectureFeatures):
    """Generates general IFM2 registers.

    Region, base addresses, tiles and strides are only generated when IFM2 is not a scalar;
    the zero point is always generated.
    """
    # NOTE(review): the extent of the "not has_scalar" branch was reconstructed from a copy of this file that
    # had lost its indentation - confirm against upstream
    if not has_scalar:
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout, ifm2.data_type.size_in_bytes(), arch)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100473
474
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general OFM registers: region, base addresses, tiles, shape, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout, ofm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The shape registers hold the size minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100494
495
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers: KERNEL_HEIGHT_M1, KERNEL_WIDTH_M1 and KERNEL_STRIDE.

    KERNEL_STRIDE value as assembled here:
        bit 0:     low bit of (stride_x - 1)
        bit 1:     low bit of (stride_y - 1)
        bit 2:     set for part-kernel-first block traversal
        bits 3..:  dilation_x - 1
        bits 4..:  dilation_y - 1
        bits 6..:  remaining bits of (stride_x - 1)
        bits 9..:  remaining bits of (stride_y - 1)
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1

    # stride low bits
    stride = stride_x_m1 & 1
    stride |= (stride_y_m1 & 1) << 1
    # stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    # dilation
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
513
514
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers.

    Nothing is generated for an empty list. The region is taken from the first address range. Each core
    that has its own address range gets that range; a core that is present in arch but has no range of
    its own gets the address of the first range with length 0.
    """
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # (base, length) register pair per core
    core_registers = (
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    )
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(core_registers):
        if core < len(weights):
            core_weights = weights[core]
            check_alignment(core_weights.address, 16)
            check_size(core_weights.length, 16)
            emit.cmd1_with_address(base_cmd, core_weights.address)
            emit.cmd1_with_offset(length_cmd, core_weights.length)
        elif core < arch.ncores:
            first_address = weights[0].address
            check_alignment(first_address, 16)
            emit.cmd1_with_address(base_cmd, first_address)
            emit.cmd1_with_offset(length_cmd, 0)
536
537
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers.

    Nothing is generated for an empty list. The region is taken from the first address range. Each core
    that has its own address range gets that range; a core that is present in arch but has no range of
    its own gets the address of the first range with length 0.
    """
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # (base, length) register pair per core
    core_registers = (
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    )
    # Set scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(core_registers):
        if core < len(biases):
            core_biases = biases[core]
            # Note: the base address is emitted before the length is validated
            emit.cmd1_with_address(base_cmd, core_biases.address)
            check_size(core_biases.length, 16)
            emit.cmd1_with_offset(length_cmd, core_biases.length)
        elif core < arch.ncores:
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
554
555
def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers; each is written with the size minus one"""
    block_registers = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, size in block_registers:
        emit.cmd0_with_param(blk_cmd, size - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100564
565
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers.

    IFM2_IB_START is only generated for operations for which has_ifm2 is true.
    """
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    accumulator_format = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, accumulator_format)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100577
578
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.

    NOTE(review): this function has no implementation; it returns None for every input.
    get_arch_block_config looks like the function that does the actual work - confirm
    whether this stub can be removed.
    """
    return None
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100586
587
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation.

    Derives the block type from the class of npu_op and asks try_block_config whether
    npu_op.block_config fits on the given architecture.

    Args:
        npu_op: the operation; its block_config must have been set
        block_traversal: part-kernel-first traversal is forwarded to try_block_config
        arch: the architecture to allocate for

    Returns:
        The ArchitectureBlockConfig produced by try_block_config.

    Raises:
        AssertionError: if block_config has not been set, the operation type is not supported,
            or the block config does not fit.
    """
    # The checks in this function raise AssertionError explicitly instead of using assert statements.
    # assert statements are removed when Python runs with -O, which would let an unsupported operation
    # or a non-fitting block config pass silently. The exception type and messages are unchanged.
    if npu_op.block_config is None:
        raise AssertionError("block_config has not been set")

    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        is_reduce_sum = npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM
        block_type = NpuBlockType.ReduceSum if is_reduce_sum else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        raise AssertionError("Unsupported operation")

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # Two LUT banks are requested when the activation is a table lookup
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    if arch_block_config is None:
        raise AssertionError(f"block_config {npu_op.block_config} does not fit")
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100637
638
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits a KERNEL_WAIT and/or DMA_WAIT command for each watermark in cmd_waits that is >= 0"""
    # The kernel wait is always emitted before the DMA wait; negative watermarks are skipped
    wait_requests = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, watermark in wait_requests:
        if watermark >= 0:
            emit.cmd_wait(wait_cmd, 0, watermark)
646
647
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """
    Emits the register settings shared by most block operations.

    In order: IFM, IFM precision, IFM upscale, padding (when present), OFM, OFM precision,
    kernel (skipped for elementwise operations), weights, biases, activation, block config
    and SHRAM registers. Both npu_op.ifm and npu_op.ofm must be set.
    """
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert not (ifm is None or ofm is None)

    # Input feature map
    generate_ifm(emit, ifm, arch)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_mode = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_mode)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm, arch)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Elementwise operations do not get kernel registers
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # The architecture block config is resolved before the block config register is written
    shram_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100674
675
676# -------------------------------------------------------------------
677# SCALING
678# -------------------------------------------------------------------
679
680
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Computes the OFM scale and shift for a pooling operation and emits NPU_SET_OFM_SCALE.

    The scale/shift pair is chosen by the first matching case:
    - fused SIGMOID/TANH activation: derived from 0x3000 * IFM scale
    - fused quantize: derived from IFM scale / OFM scale
    - explicit rescale on the operation (ExplicitScaling, or a plain rescale value)
    - IFM and OFM scales both known: derived from IFM scale / OFM scale
    - otherwise: scale 1, shift 0
    """

    def bits_needed(value) -> int:
        # Number of binary digits of value after rounding up to an int (bin() output minus its "0b" prefix)
        return len(bin(round_up_to_int(value))) - 2

    # For valid padding vela has to output scaling values
    pool_kernel = pool_op.kernel
    in_quant = pool_op.ifm.quantization
    out_quant = pool_op.ofm.quantization
    activation = pool_op.activation

    if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert in_quant.scale_f32 is not None
        rescale = 0x3000 * in_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            scale_log2 = math.log2(in_quant.scale_f32)
            nearest_log2 = int(round(scale_log2))
            is_power_of_two = abs(scale_log2 - nearest_log2) < 0.001
            shift = nearest_log2 + 12
            tanh_special = activation.op_type == NpuActivationOp.TANH and shift in (0, 1)
            sigmoid_special = activation.op_type == NpuActivationOp.SIGMOID and shift == 0
            if is_power_of_two and (tanh_special or sigmoid_special):
                # Special handling of a power-of-two input scale: shift 0 (scale 2**-12) for
                # tanh/sigmoid, shift 1 (scale 2**-11) for tanh only
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale until it exceeds half the int16 maximum (or the shift limit is hit)
                shift = 0
                rescale_limit = np.iinfo(np.int16).max / 2
                while rescale <= rescale_limit and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = bits_needed(rescale) + 1
            scale, shift = scaling.quantise_pooling_scale(pool_kernel.height * pool_kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        scale_ratio = np.double(in_quant.scale_f32) / np.double(out_quant.scale_f32)
        scale, shift = scaling.quantise_scale(scale_ratio)
    elif pool_op.rescale is not None:
        if type(pool_op.rescale) is ExplicitScaling:
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit = pool_op.rescale
            assert explicit.per_channel is False
            scale, shift = explicit.multiplier[0], explicit.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = bits_needed(rescale) + 1
            scale, shift = scaling.quantise_pooling_scale(pool_kernel.height * pool_kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif out_quant.scale_f32 is not None and in_quant.scale_f32 is not None:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        rescale = in_quant.scale_f32 / out_quant.scale_f32
        rescale_bits = 0
        if pool_kernel.height == pool_kernel.width == 1:
            if rescale > 1:
                rescale_bits = bits_needed(rescale) + 1
            elif rescale < 1:
                rescale_bits = -(bits_needed(1 / rescale) - 1)
        scale, shift = scaling.quantise_pooling_scale(pool_kernel.height * pool_kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # No quantization scales available: no scaling
        scale = 1
        shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
753
754
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale

    NPU_SET_OFM_SCALE is always emitted; NPU_SET_OPA_SCALE and NPU_SET_OPB_SCALE are only
    emitted for ADD and SUB. The returned value stays 0 unless the advanced add/sub scaling
    path selects an operand.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A feature map without quantization contributes a scale of None
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        # A fused SIGMOID/TANH activation replaces the OFM scale with the fixed value 1/0x3000
        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            # NOTE(review): rescale is tested for truthiness here but with "is not None" in the
            # Add/Sub branch below - confirm whether the difference is intentional
            if npu_op.rescale:
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # At least one scale is missing: no ofm scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                # (i.e. the simplified result is kept only when the low 12 bits of ofm_scale are all zero)
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                # Input scales differ
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                # (this overwrites any values produced by the simplified scaling above)
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            # Operand scale registers are only written for Add/Sub
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LRELU/ABS: the ofm scale is derived from the OFM quantization scale alone
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other elementwise operators: no ofm scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
843
844
845# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100846# PRINT
847# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200848
849
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides, tiles and name of fm, if fm is set"""
    if fm is None:
        return

    quantization = fm.quantization
    if quantization is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    height, width, depth = fm.shape
    size_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, {fm.data_type},"
        f" size={size_bytes}, {quant_str}"
    )

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    tiles = fm.tiles
    base_addresses = list(map(hex, tiles.addresses))
    print(
        f" {stride_str}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1},"
        f" base={base_addresses}"
    )
    print(f" name={fm.name}")
Tim Hall79d07d22020-04-27 18:20:16 +0100866
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100867
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a description of one NPU operation, prefixed with index.

    Operations that are neither DMA nor block operations, and DMA operations, are printed as
    a single line. Block operations are followed by their feature maps, kernel, padding,
    weights, biases, activation, block traversal (Conv2D only) and block config.
    If cmd is truthy it is appended to the heading line.
    """
    pass_info = f" {cmd}" if cmd else ""

    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return

    # Block operation from here on
    kernel = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    has_sub_op = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))
    is_conv2d = isinstance(npu_op, NpuConv2DOperation)

    # Heading line
    if has_sub_op:
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations are all 1 is labelled as fully connected
        is_fully_connected = (
            is_conv2d
            and kernel.elements_wh() * kernel.stride.x * kernel.stride.y * kernel.dilation.x * kernel.dilation.y
            == 1
        )
        fc = "FullyConnected " if is_fully_connected else ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and biases
    if kernel is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {kernel}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weight_range in npu_op.weights:
        print(f" Weights: {weight_range}")
    for bias_range in npu_op.biases:
        print(f" Scales: {bias_range}")

    # The activation is only printed when it is more than a NONE_OR_RELU without limits
    activation = npu_op.activation
    if activation is not None:
        if (
            activation.op_type != NpuActivationOp.NONE_OR_RELU
            or activation.min is not None
            or activation.max is not None
        ):
            if activation.op_type == NpuActivationOp.TABLE_LOOKUP:
                lut = f", lut index={activation.lookup_table_index}"
            else:
                lut = ""
            print(f" Activation: {activation.op_type.name}, min={activation.min}, max={activation.max}{lut}")

    if is_conv2d:
        print(f" {npu_op.block_traversal}")

    block_height, block_width, block_depth = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_sub_op else ""
    print(
        f" Block config: h={block_height},w={block_width},c={block_depth},"
        f" {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100915
Tim Hall79d07d22020-04-27 18:20:16 +0100916
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list together with its entry in npu_op_to_cmd, if it has one"""
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for position, operation in enumerate(npu_op_list):
        associated_cmd = npu_op_to_cmd.get(operation)
        print_operation(operation, position, associated_cmd)
Tim Hall79d07d22020-04-27 18:20:16 +0100921
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100922
923# -------------------------------------------------------------------
924# OPERATIONS
925# -------------------------------------------------------------------
926
927
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the type of npu_op"""
    if isinstance(npu_op, NpuDmaOperation):
        # Channel and mode are combined into one parameter
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
942
943
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations, using the block traversal stored on the operation"""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, block_traversal=traversal, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100947
948
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations (always depth first traversal)"""
    generate_common(emit, npu_op, block_traversal=NpuBlockTraversal.DEPTH_FIRST, arch=arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100954
955
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    Raises VelaError for a REDUCE_SUM whose IFM is not NHWC when the IFM data type is INT32
    or the accelerator config is Ethos_U65_512.
    """
    pooling_type = npu_op.sub_op_type

    # check that reduce_sum input is NHWC
    if pooling_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        if arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # Average and reduce-sum pooling without any padding use the global OFM scale
    use_global_scale = pooling_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    if type(npu_op.rescale) is ExplicitScaling:
        use_global_scale = not npu_op.rescale.per_channel

    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100981
982
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for elementwise operations.

    The scaling registers are emitted first, then the common registers. Binary operations
    additionally get the IFM2 registers and, when ifm2_scalar is set, the quantized scalar.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no second input to describe
        return

    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar, arch)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return

    # The scalar is quantized and must be representable in the IFM2 data type
    quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
    ifm2_type = npu_op.ifm2.data_type
    assert ifm2_type.min_value() <= quantized_scalar <= ifm2_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001008
1009
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
    """Checks the DMA operation and generates its source, destination and length registers"""
    check_dma_op(dma_op, arch)

    source = dma_op.src
    destination = dma_op.dest

    # Source region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    # Destination region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    # The transfer length is taken from the source range
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001020
1021
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches to the generator that matches the operation type; nothing is returned.
    """
    # Checked in this order; the first matching operation class decides the generator
    generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
        (NpuDmaOperation, generate_dma_op),
    )
    for op_class, generate in generators:
        if isinstance(npu_op, op_class):
            generate(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001039
1040
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.

    For every operation the memory limits are checked, the wait dependencies are determined
    and the registers are generated; then BLOCKDEP (block operations only), the wait commands
    and the NPU_OP_* command are emitted. The stream is ended with NPU_OP_STOP.
    If add_to_debug_db is given, it is called with each operation and the emitter offset
    reached after that operation. With verbose set, the operations and commands are printed.

    Returns Ethos-U instructions, as a list of 32-bit integers.
    Raises VelaError if the stream is 16 MiB or larger. ByteAlignmentError, ByteSizeError and
    VelaError raised for an operation are re-raised with the operation index and type appended.
    """

    def with_location(err, index: int, op: NpuOperation) -> str:
        # Error text extended with the position and type of the failing operation
        return f"{err.error_msg}, in operation {index}:{op.op_type.name}"

    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for listed_op in npu_op_list:
        if isinstance(listed_op, NpuDmaOperation):
            memory_accesses[listed_op] = get_dma_memory_accesses(listed_op)
        elif isinstance(listed_op, NpuBlockOperation):
            memory_accesses[listed_op] = get_op_memory_accesses(listed_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    # Generate register commands for all operations
    prev_op = None
    outstanding_dma_ops: List[NpuOperation] = []
    outstanding_npu_ops: List[NpuOperation] = []
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops)
            generate_registers_for_op(emit, npu_op, arch)
        except ByteAlignmentError as err:
            # Enables testing for ByteAlignmentErrors specifically
            raise ByteAlignmentError(with_location(err, op_index, npu_op)) from None
        except ByteSizeError as err:
            # Enables testing for ByteSizeErrors specifically
            raise ByteSizeError(with_location(err, op_index, npu_op)) from None
        except VelaError as err:
            raise VelaError(with_location(err, op_index, npu_op)) from None

        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP, clamped to what the architecture supports
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    size_limit_bytes = 1 << 24
    if emit.size_in_bytes() >= size_limit_bytes:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res
1112
1113
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator value used to pick the Ethos-U accelerator and its default architecture
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # Regions 0-7 are limited by the maximum address offset, the mem2mem index by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)