Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

blob: 016546974471f9c47b137ccdc18342c4e76bf641 [file] [log] [blame]

William Isaksson	56e5f0c	2024-01-10 12:28:04 +0100	[diff] [blame]	1	# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame]	16	#
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	17	# Description:
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	18	# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	19	# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	20	# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaard	c629129	2021-03-19 09:35:48 +0100	[diff] [blame]	21	import math
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	22	from collections import defaultdict
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	23	from enum import Enum
				24	from enum import IntEnum
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	25	from typing import cast
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	26	from typing import Dict
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	27	from typing import List
				28	from typing import Optional
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	29
				30	import numpy as np
				31
				32	from . import scaling
Louis Verhaard	aeae567	2020-11-02 18:04:27 +0100	[diff] [blame]	33	from .api import NpuAccelerator
William Isaksson	56e5f0c	2024-01-10 12:28:04 +0100	[diff] [blame]	34	from .api import NpuAccumulatorType
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	35	from .api import NpuActivation
				36	from .api import NpuActivationOp
				37	from .api import NpuAddressRange
				38	from .api import NpuBlockOperation
				39	from .api import NpuBlockTraversal
				40	from .api import NpuConv2DOperation
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	41	from .api import NpuConvDepthWiseOperation
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	42	from .api import NpuDataType
				43	from .api import NpuDmaOperation
				44	from .api import NpuElementWiseOp
				45	from .api import NpuElementWiseOperation
				46	from .api import NpuFeatureMap
				47	from .api import NpuKernel
				48	from .api import NpuLayout
				49	from .api import NpuOperation
				50	from .api import NpuOperationType
				51	from .api import NpuPadding
				52	from .api import NpuPoolingOp
				53	from .api import NpuPoolingOperation
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	54	from .api import NpuResamplingMode
				55	from .api import NpuRoundingMode
				56	from .api import NpuShape3D
				57	from .api import NpuTileBox
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	58	from .architecture_allocator import ArchitectureBlockConfig
				59	from .architecture_allocator import try_block_config
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	60	from .architecture_features import Accelerator
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	61	from .architecture_features import ArchitectureFeatures
Louis Verhaard	5207830	2020-11-18 13:35:06 +0100	[diff] [blame]	62	from .architecture_features import create_default_arch
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	63	from .architecture_features import SHRAMElements
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	64	from .errors import ByteAlignmentError
				65	from .errors import ByteSizeError
erik.andersson@arm.com	1878dab	2021-03-16 09:40:24 +0100	[diff] [blame]	66	from .errors import VelaError
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	67	from .ethos_u55_regs.ethos_u55_regs import acc_format
				68	from .ethos_u55_regs.ethos_u55_regs import activation
				69	from .ethos_u55_regs.ethos_u55_regs import cmd0
				70	from .ethos_u55_regs.ethos_u55_regs import cmd1
				71	from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	72	from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlin	cf7da10	2020-05-20 09:03:40 +0200	[diff] [blame]	73	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	74	from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	75	from .numeric_util import round_away_zero
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	76	from .numeric_util import round_up_to_int
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	77	from .operation import ExplicitScaling
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	78	from .operation import NpuBlockType
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	79	from .range_set import MemoryAccessSet
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	80	from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	81	from .register_command_stream_util import calc_blockdep
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	82	from .register_command_stream_util import check_addresses
				83	from .register_command_stream_util import check_alignment
				84	from .register_command_stream_util import check_dma_op
Björn Davidsson	199e8e6	2023-10-10 11:22:59 +0200	[diff] [blame]	85	from .register_command_stream_util import check_length
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	86	from .register_command_stream_util import check_strides
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	87	from .register_command_stream_util import get_dma_memory_accesses
				88	from .register_command_stream_util import get_op_memory_accesses
				89	from .register_command_stream_util import get_strides
				90	from .register_command_stream_util import get_wait_dependency
Fredrik Svedberg	f3c7d55	2022-11-04 09:48:49 +0100	[diff] [blame]	91	from .register_command_stream_util import get_zero_point
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	92	from .register_command_stream_util import has_ifm2
Fredrik Svedberg	f3c7d55	2022-11-04 09:48:49 +0100	[diff] [blame]	93	from .register_command_stream_util import quantise
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	94	from .register_command_stream_util import shape3d_to_block
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	95	from .register_command_stream_util import to_kernel
				96	from .register_command_stream_util import UNARY_ELEMWISE_OPS
				97	from .register_command_stream_util import Watermark
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	98
				99
				100	class RegisterMachine:
				101	def __init__(self):
				102	self.n_banks = 1
				103	self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
				104	self.bank_idx = 0
				105
				106	def set_register(self, reg, value):
				107	is_changed = self.registers[self.bank_idx][reg] != value
				108	self.registers[self.bank_idx][reg] = value
				109	# is_changed = True # force command
				110	return is_changed
				111
				112	def switch_bank(self):
				113	self.bank_idx = (self.bank_idx + 1) % self.n_banks
				114
				115
				116	class CmdMode(IntEnum):
				117	NoPayload = 0x0000
				118	Payload32 = 0x4000
				119	Mask = 0xC000
				120	CmdOpMask = 0x03FF
				121
				122
class CommandStreamEmitter:
    """Collects the words of a command stream, skipping register writes that would not change the register"""

    WORD_SIZE = 4

    def __init__(self):
        # Each entry is a tuple holding the word(s) of one command
        self.cmd_stream = []
        # Index 0 tracks all non-DMA registers, index 1 tracks the DMA registers
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Size in bytes of everything emitted so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that tracks the given command"""
        machine_idx = 1 if "DMA" in cmd.name else 0
        return self.reg_machine[machine_idx]

    def size_in_bytes(self):
        """Returns the total size in bytes of all commands in the stream"""
        return sum(len(cmd) * CommandStreamEmitter.WORD_SIZE for cmd in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns the command stream flattened to a list of words"""
        words = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a table with one row per command in the stream"""
        header_columns = (
            f" {'Offset':6}:",
            f" {'Payload':8}",
            f"{'Param':4}",  # no leading space for alignment
            f" {'Code':4}",
            f" - {'Command':30}",
            f" {'Param':5}",
        )
        print("".join(header_columns))

        byte_offset = 0
        for words in self.cmd_stream:
            first_word = words[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)

            if payload_mode == CmdMode.NoPayload:
                payload_column = f" {'':8}"
                command_column = f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                size = 4
            else:
                assert payload_mode == CmdMode.Payload32
                payload_column = f" {words[1]:08x}"
                command_column = f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                size = 8

            row = f"{byte_offset:#08x}:" + payload_column
            row += f" {param:04x}" + f" {code:04x}"
            row += command_column + f" {param:5}"
            print(row)
            byte_offset += size

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single word register command with a 16-bit parameter, unless it is redundant"""
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a register command followed by a 32-bit payload word, unless it is redundant"""
        payload = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, payload)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, payload))
            self.offset += 2 * CommandStreamEmitter.WORD_SIZE

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a payload command; bits 32 and up of offset are passed as the command parameter"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; always written, no redundancy check is done"""
        param = outstanding_count + (16 * channel)
        self._append_single_word(cmd, param)

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command and switches register bank of the command's register machine"""
        self._append_single_word(cmd, int(param))
        self.get_reg_machine(cmd).switch_bank()

    def _append_single_word(self, cmd, param):
        # Adds a one word command with the lower 16 bits of param in the upper half of the word
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
				226
				227
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	228	# -------------------------------------------------------------------
				229	# REGISTER GENERATION
				230	# -------------------------------------------------------------------
				231
				232
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	233	# TODO: Replace with definitions from ethos_u55_regs
				234	class IFM2Broadcast(IntEnum):
				235	BroadcastHdim = 1 << 0
				236	BroadcastWdim = 1 << 1
				237	BroadcastCdim = 1 << 2
				238	ReverseOperandOrder = 1 << 6
				239	UseIFM2Scalar = 1 << 7
				240
				241
# Maps an API pooling op to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise op to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation op to the corresponding activation enum member (the member itself, not its value).
# NpuActivationOp.TABLE_LOOKUP has no entry here; generate_activation handles it separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API accumulator type to the corresponding acc_format value
npu_acc_format_map = {
    NpuAccumulatorType.Int32: acc_format.INT_32BIT.value,
    NpuAccumulatorType.Int40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode enum member (the member itself, not its value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
				294
				295
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    mem_limits maps a region to the highest address offset that is allowed in that region.
    Both the start and the end of every accessed range are checked.

    Raises VelaError if a region has no entry in mem_limits, if an offset is negative,
    or if an offset is larger than the limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Deliberately not called "max", that would shadow the builtin inside this function
            max_offset = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max_offset:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {max_offset}. Perhaps try"
                            f" running with the HillClimb tensor allocator and/or increasing the maximum iteration of"
                            f" that allocator"
                        )
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	313
				314
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the IFM_PAD_TOP/LEFT/BOTTOM/RIGHT registers"""
    pad_registers = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_registers:
        emit.cmd0_with_param(pad_cmd, amount)
				321
				322
				323	def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
				324	"""Generates ACTIVATION registers"""
				325	act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)
				326
				327	if act.min is None:
				328	quantized_min = ofm.data_type.min_value()
				329	else:
				330	quantized_min = quantise(act.min, ofm.quantization)
				331	if act.max is None:
				332	quantized_max = ofm.data_type.max_value()
				333	else:
				334	quantized_max = quantise(act.max, ofm.quantization)
				335	quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
				336	quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
				337	if act.op_type == NpuActivationOp.TABLE_LOOKUP:
				338	assert 0 <= act.lookup_table_index < 8
				339	activation_value = 16 + act.lookup_table_index
				340	if ofm.data_type == NpuDataType.INT32:
				341	activation_value \|= 3 << 12 # Force I8 range
				342	quantized_min = max(-128, quantized_min)
				343	quantized_max = min(127, quantized_max)
				344	else:
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	345	activation_value = cast(int, activation_op_map[act.op_type])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	346	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
				347	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
				348	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
				349
				350
def generate_addresses(
    emit: CommandStreamEmitter,
    ptr_cmds: List[cmd1],
    addresses: List[int],
    layout: NpuLayout,
    element_size,
    arch: ArchitectureFeatures,
):
    """Generates the four xFM_BASE registers after the addresses have been validated by check_addresses"""
    check_addresses(addresses, layout, element_size, arch)
    # One base address register per tile
    for tile in (0, 1, 2, 3):
        base_cmd = ptr_cmds[tile]
        emit.cmd1_with_address(base_cmd, addresses[tile])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	363
				364
				365	def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
				366	"""Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
				367	emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
				368	emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
				369	emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)
				370
				371
				372	def generate_strides(
				373	emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
				374	):
				375	"""Generates STRIDE_C/Y/X registers"""
				376	strides = get_strides(fm)
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	377	check_strides(fm, strides)
				378
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	379	emit.cmd1_with_address(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C)
				380	emit.cmd1_with_address(stride_y_cmd, strides.height) # stride between vertical values (H)
				381	emit.cmd1_with_address(stride_x_cmd, strides.width) # stride between horisontal values (W)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	382
				383
				384	def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
				385	"""Generates IFM/IFM2_PRECISION register"""
				386	dtype = fm.data_type
				387	prec = 1 if dtype.is_signed() else 0
				388	activation_precision = precision_map[dtype.size_in_bits()]
				389	prec += activation_precision << 2
				390
				391	if fm.layout == NpuLayout.NHCWB16:
				392	prec \|= 1 << 6
				393
				394	prec \|= op_to_scale << 8
				395	emit.cmd0_with_param(precision_cmd, prec)
				396
				397
				398	def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
				399	"""Generates OFM_PRECISION register"""
				400	dtype = npu_op.ofm.data_type
				401	prec = 1 if dtype.is_signed() else 0
				402	activation_precision = precision_map[dtype.size_in_bits()]
				403	prec += activation_precision << 1
				404
				405	if use_global_scale:
				406	# Set global scale bit, as opposed to using per channel scale
				407	prec \|= 1 << 8
				408	if npu_op.ofm.layout == NpuLayout.NHCWB16:
				409	prec \|= 1 << 6
				410	prec \|= rounding_mode_map[npu_op.rounding_mode] << 14
				411	emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
				412
				413
				414	def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
				415	"""Generates IFM2_BROADCAST register for binary elementwise operations"""
				416	ifm2_broadcast = 0
				417	ifm = npu_op.ifm
				418	ifm2 = npu_op.ifm2
				419	if npu_op.reversed_operands:
				420	ifm2_broadcast \|= IFM2Broadcast.ReverseOperandOrder
				421	if npu_op.ifm2_scalar is not None:
				422	# IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
				423	ifm2_broadcast \|= IFM2Broadcast.UseIFM2Scalar
				424	else:
				425	if ifm.shape.height != ifm2.shape.height:
				426	# Broadcast in 'H' dimension
				427	assert ifm2.shape.height == 1
				428	ifm2_broadcast \|= IFM2Broadcast.BroadcastHdim
				429
				430	if ifm.shape.width != ifm2.shape.width:
				431	# Broadcast in 'W' dimension
				432	assert ifm2.shape.width == 1
				433	ifm2_broadcast \|= IFM2Broadcast.BroadcastWdim
				434
				435	if ifm.shape.depth != ifm2.shape.depth:
				436	# Broadcast in 'C' dimension
				437	assert ifm2.shape.depth == 1
				438	ifm2_broadcast \|= IFM2Broadcast.BroadcastCdim
				439
				440	emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
				441
				442
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general IFM registers: region, base addresses, tiles, depth, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout, ifm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    # The depth register holds the value minus one
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	460
				461
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool, arch: ArchitectureFeatures):
    """Generates general IFM2 registers; region, base addresses and tiles are skipped when IFM2 is a scalar"""
    # NOTE(review): the listing this was taken from had lost its indentation; the extent of the
    # "if not has_scalar" body (region, addresses, tiles) is reconstructed - confirm against the repository
    if not has_scalar:
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout, ifm2.data_type.size_in_bytes(), arch)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	479
				480
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general OFM registers: region, base addresses, tiles, shape, strides and zero point"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout, ofm.data_type.size_in_bytes(), arch)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The shape registers hold the value minus one
    ofm_shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm_shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm_shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm_shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	500
				501
				502	def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
				503	"""Generates KERNEL related registers"""
				504	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
				505	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
				506	# set kernel x stride low bit
				507	stride = (kernel.stride_x - 1) & 1
				508	# set kernel y stride low bit
				509	stride \|= (kernel.stride_y - 1 & 1) << 1
				510	# set kernel x stride extension bits
				511	stride \|= (kernel.stride_x - 1 >> 1) << 6
				512	# set kernel y stride extension bits
				513	stride \|= (kernel.stride_y - 1 >> 1) << 9
				514	stride \|= (kernel.dilation_x - 1) << 3
				515	stride \|= (kernel.dilation_y - 1) << 4
				516	if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
				517	stride \|= 1 << 2
				518	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
				519
				520
				521	def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
				522	"""Generates WEIGHT registers"""
				523	if len(weights) == 0:
				524	return
				525	emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
				526	# Set weights sources for active and present cores
				527	for core, (addr, length) in enumerate(
				528	[
				529	(cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
				530	(cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
				531	]
				532	):
				533	if core < len(weights):
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	534	check_alignment(weights[core].address, 16)
Björn Davidsson	199e8e6	2023-10-10 11:22:59 +0200	[diff] [blame]	535	check_length(weights[core].length, 16)
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	536	emit.cmd1_with_address(addr, weights[core].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	537	emit.cmd1_with_offset(length, weights[core].length)
				538	elif core < arch.ncores:
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	539	check_alignment(weights[0].address, 16)
William Isaksson	6165283	2023-08-07 10:32:07 +0000	[diff] [blame]	540	check_length(weights[0].length, 16)
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	541	emit.cmd1_with_address(addr, weights[0].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	542	emit.cmd1_with_offset(length, 0)
				543
				544
				545	def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
				546	"""Generates SCALE registers"""
				547	if len(biases) == 0:
				548	return
				549	emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
				550	# Set weights sources for active and present cores
				551	for core, (addr, length) in enumerate(
				552	[(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
				553	):
				554	if core < len(biases):
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	555	emit.cmd1_with_address(addr, biases[core].address)
Björn Davidsson	199e8e6	2023-10-10 11:22:59 +0200	[diff] [blame]	556	check_length(biases[core].length, 16)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	557	emit.cmd1_with_offset(length, biases[core].length)
				558	elif core < arch.ncores:
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	559	emit.cmd1_with_address(addr, biases[0].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	560	emit.cmd1_with_offset(length, 0)
				561
				562
				563	def generate_block_config(
Jonas Ohlsson	d857507	2022-03-30 10:30:25 +0200	[diff] [blame]	564	emit: CommandStreamEmitter,
				565	block_config: NpuShape3D,
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	566	):
				567	"""Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	568	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
				569	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
				570	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	571
				572
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers

    IFM2_IB_START is only emitted when the operation has an IFM2. The accumulator format comes from the
    operation's accumulator type unless that is Default, in which case the block config's acc_type is used.
    """
    layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, layout.ib_start2)
    if npu_op.accumulator_type == NpuAccumulatorType.Default:
        acc_format = acc_format_map[arch_block_config.acc_type]
    else:
        # An explicitly requested accumulator type overrides the one chosen for the block config
        acc_format = npu_acc_format_map[npu_op.accumulator_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	587
				588
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation

    Derives the block type from the operation class, collects the feature map shapes, IFM bit width, LUT and
    scaling information, and passes them together with the operation's block config to try_block_config.
    Asserts that a block config has been set and that try_block_config returned a result.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Select the block type from the concrete operation class
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    # Two LUT banks are requested only when the activation is a table lookup
    activation = npu_op.activation
    if activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        lut_banks = 2
    else:
        lut_banks = 0

    # The operation counts as scaled only if every feature map in use has a quantization with a scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    scaled = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps)

    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None

    arch_block_config = try_block_config(
        shape3d_to_block(npu_op.block_config),
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        shape3d_to_block(npu_op.ifm.shape),
        ifm2_shape,
        npu_op.ifm2_scalar is not None,
        npu_op.ifm.data_type.size_in_bits(),
        is_partkernel=block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=scaled,
        ifm_resampling=resampling_mode_map[npu_op.ifm_upscale],
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	638
				639
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT

    A wait command is emitted for the npu and/or dma field of cmd_waits only when that field is >= 0.
    """
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, watermark in pending_waits:
        if watermark >= 0:
            emit.cmd_wait(wait_cmd, 0, watermark)
				647
				648
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations

    Emits, in order: IFM, IFM precision, IFM upscale, padding (if set), OFM, OFM precision, kernel (skipped
    for elementwise operations), weights, biases, activation, block config and SHRAM registers.
    """
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert ifm is not None
    assert ofm is not None

    # Input feature map
    generate_ifm(emit, ifm, arch)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)

    # Output feature map
    generate_ofm(emit, ofm, arch)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are not generated for elementwise operations
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # The architecture block config is needed for the SHRAM registers
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	675
				676
				677	# -------------------------------------------------------------------
				678	# SCALING
				679	# -------------------------------------------------------------------
				680
				681
				682	def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
				683	"""Generates OFM_SCALE register for pooling operations"""
				684	# For valid padding vela has to output scaling values
				685	kernel = pool_op.kernel
				686	ifm_quant = pool_op.ifm.quantization
				687	ofm_quant = pool_op.ofm.quantization
				688	if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
				689	assert ifm_quant.scale_f32 is not None
				690	rescale = 0x3000 * ifm_quant.scale_f32
				691	if pool_op.ifm.data_type == NpuDataType.INT16:
				692	# Calculate scale and shift for the output scale of 1/(3*4096)
Louis Verhaard	c629129	2021-03-19 09:35:48 +0100	[diff] [blame]	693	x_log2 = math.log2(ifm_quant.scale_f32)
				694	rounded_log2 = int(round(x_log2))
				695	is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
				696	shift = rounded_log2 + 12
Patrik Gustavsson	e3dd2f3	2021-12-02 09:08:26 +0100	[diff] [blame]	697	if is_power_of_two and (
				698	(pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
				699	or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
				700	):
				701	# Special handling if input scale is 1/2048 (tanh/sigmoid) or 1/4096 (tanh)
Louis Verhaard	c629129	2021-03-19 09:35:48 +0100	[diff] [blame]	702	scale = 3 << shift
				703	shift = 0
				704	else:
				705	shift = 0
				706	max_rescale = np.iinfo(np.int16).max / 2
				707	while rescale <= max_rescale and shift <= 30:
				708	shift += 1
				709	rescale *= 2
				710	scale = int(rescale)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	711	else:
				712	rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
				713	scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
				714	scale = int(round_away_zero(scale * rescale))
				715	elif pool_op.fused_quantize:
				716	# Quantize op requires different scaling
				717	ifm_scale_f64 = np.double(ifm_quant.scale_f32)
				718	ofm_scale_f64 = np.double(ofm_quant.scale_f32)
				719	scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
				720	elif pool_op.rescale is not None:
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	721	if type(pool_op.rescale) == ExplicitScaling:
				722	# Note: reuse of rescale for explicit scaling to not expose this in the external API
				723	explicit_scaling = pool_op.rescale
				724	assert explicit_scaling.per_channel is False
				725	scale = explicit_scaling.multiplier[0]
				726	shift = explicit_scaling.shift[0]
				727	else:
Tim Hall	885033b	2022-07-21 11:46:03 +0100	[diff] [blame]	728	# for ResizeBilinear/NearestNeighbor operations with rescale
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	729	# Note: this is not used, but part of the public API
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	730	rescale = pool_op.rescale
				731	rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
				732	scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
				733	scale = int(round_away_zero(scale * rescale))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	734	else:
				735	# In case avg pool fused with concat or other memory operation, rescaling might be needed.
				736	# kernel height == kernel width == 1 is always true in this case
				737	# Normally the scale is maximised, to get maximum precision, which means that
				738	# if rescale != 1, scale need to consider the number of bits needed for rescaling
				739	if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
				740	rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
				741	rescale_bits = 0
				742	if kernel.height == kernel.width == 1:
				743	if rescale > 1:
				744	rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
				745	elif rescale < 1:
				746	rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
				747	scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
				748	scale = int(round_away_zero(scale * rescale))
				749	else:
				750	scale = 1
				751	shift = 0
				752
				753	emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
				754
				755
				756	def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
				757	"""
				758	Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
				759	Returns the operator to scale
				760	"""
				761	op_to_scale = 0
				762	if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
				763	input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
				764	input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
				765	output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
				766
				767	if npu_op.activation is not None and npu_op.activation.op_type in (
				768	NpuActivationOp.SIGMOID,
				769	NpuActivationOp.TANH,
				770	):
				771	output_scale = 1 / 0x3000
				772
				773	if npu_op.sub_op_type == NpuElementWiseOp.MUL:
Patrik Gustavsson	b081d67	2021-08-25 13:49:25 +0200	[diff] [blame]	774	if npu_op.rescale:
				775	ofm_scale, shift = npu_op.rescale
				776	elif None in (input_scale, input2_scale, output_scale):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	777	ofm_scale = 1
				778	shift = 0
				779	else:
				780	ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	781	else: # Add/Sub
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	782	# Default operand scaling is no scaling
				783	opa_scale = opb_scale = 1
				784	opa_shift = 0
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	785	bitdepth = npu_op.ifm.data_type.size_in_bits()
				786	use_advanced_scaling = False
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	787	if npu_op.rescale is not None:
				788	# Explicit ofm scaling
				789	ofm_scale, shift = npu_op.rescale
				790	elif None in (input_scale, input2_scale, output_scale):
				791	# No ofm scaling
				792	ofm_scale = 1
				793	shift = 0
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	794	elif input_scale == input2_scale and bitdepth == 16:
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	795	# int16 same scaling
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	796	opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
				797	input_scale, input2_scale, output_scale
				798	)
				799	# align the double rounding with that of advanced scaling
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	800	opa_scale //= 2
				801	opb_scale //= 2
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	802	shift -= 1
				803	opa_shift = 0 # Unused for this case
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	804	elif input_scale == input2_scale:
Fredrik Svedberg	4a434cb	2022-09-27 14:13:01 +0200	[diff] [blame]	805	# Same scaling
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	806	opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
				807	input_scale, input2_scale, output_scale
				808	)
				809	opa_shift = 0 # Unused for this case
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	810	# For 8 bit we can't guarantee double rounding with simplified scaling will always be
				811	# the same as with advanced scaling due to different shifts. When the ofm scale fulfils
				812	# the following we know that double rounding will have no effect for advanced scaling
				813	# no matter the input, so we can safely use simplified scaling with double rounding disabled.
				814	use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	815	else:
Henrik G Olsson	ad656a8	2021-03-19 15:50:28 +0100	[diff] [blame]	816	use_advanced_scaling = True
				817	if use_advanced_scaling:
				818	# Use advanced implementation only when input/output scales differ,
				819	# or when we can't guarantee the absence of rounding errors
Jonas Ohlsson	d857507	2022-03-30 10:30:25 +0200	[diff] [blame]	820	(
				821	opa_scale,
				822	opa_shift,
				823	ofm_scale,
				824	shift,
				825	op_to_scale,
				826	) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	827	opb_scale = 0 # Unused for this case
				828	if npu_op.reversed_operands:
				829	# If the operand order is reversed we also have to swap which operand is scaled
				830	if op_to_scale == scaling.OperandToScale.OPa:
				831	op_to_scale = scaling.OperandToScale.OPb
				832	else:
				833	op_to_scale = scaling.OperandToScale.OPa
				834	emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
				835	emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	836	elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
				837	output_scale = npu_op.ofm.quantization.scale_f32
				838	ofm_scale, shift = scaling.quantise_scale(output_scale)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	839	else:
Tim Hall	e178f38	2022-07-12 17:02:25 +0100	[diff] [blame]	840	ofm_scale = 1
				841	shift = 0
				842	emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	843	return op_to_scale
				844
				845
				846	# -------------------------------------------------------------------
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	847	# PRINT
				848	# -------------------------------------------------------------------
Jacob Bohlin	e99b893	2020-07-13 16:01:51 +0200	[diff] [blame]	849
				850
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides, tiles and name of a feature map

    Does nothing when fm is None.
    """
    if fm is None:
        return
    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    h, w, c = fm.shape
    # Size in bytes of h * w * c elements of the feature map's data type
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	867
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	868
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a description of the given operation

    DMA operations and operations that are not block operations are printed as a single line. Block
    operations are printed with their feature maps, kernel, padding, weights, scales, activation and
    block config. cmd, when truthy, is appended to the header line.
    """
    pass_info = f" {cmd}" if cmd else ""

    # Single-line cases
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, NpuBlockOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return

    # Header line for block operations
    k = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel elements, strides and dilations multiply to 1 is labelled as fully connected
        is_fully_connected = (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        )
        fc = "FullyConnected " if is_fully_connected else ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and scales
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    # The activation is printed unless it is a NONE_OR_RELU with neither min nor max set
    act = npu_op.activation
    if act is not None:
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        rescale = f", rescale={npu_op.rescale}"
    else:
        rescale = ""
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	916
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	917
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list together with its list index

    npu_op_to_cmd optionally maps an operation to extra information that is passed on to print_operation;
    operations without an entry are printed with None.
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for index, npu_op in enumerate(npu_op_list):
        cmd = npu_op_to_cmd.get(npu_op)
        print_operation(npu_op, index, cmd)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	922
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	923
				924	# -------------------------------------------------------------------
				925	# OPERATIONS
				926	# -------------------------------------------------------------------
				927
				928
				929	def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
				930	"""Generates NPU_OP_* command"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	931	if isinstance(npu_op, NpuDmaOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	932	emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	933	elif isinstance(npu_op, NpuConv2DOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	934	emit.cmd_do_operation(cmd0.NPU_OP_CONV)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	935	elif isinstance(npu_op, NpuConvDepthWiseOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	936	emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	937	elif isinstance(npu_op, NpuPoolingOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	938	emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	939	elif isinstance(npu_op, NpuElementWiseOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	940	emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
				941	else:
				942	assert 0, "Unsupported operation"
				943
				944
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations

    Only the common registers are generated, using the block traversal set on the operation.
    """
    block_traversal = npu_op.block_traversal
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	948
				949
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations

    Only the common registers are generated, always with depth first block traversal.
    """
    block_traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, block_traversal, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	955
				956
				957	def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
				958	"""Generates register commands for pooling operations"""
Tim Hall	d6efcd3	2022-09-02 15:01:01 +0100	[diff] [blame]	959	# check that reduce_sum input is NHWC
				960	if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
				961	if npu_op.ifm.data_type == NpuDataType.INT32:
				962	raise VelaError(
				963	f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
				964	f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
				965	)
				966	elif arch.accelerator_config == Accelerator.Ethos_U65_512:
				967	raise VelaError(
				968	f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
				969	f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
				970	)
				971
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	972	use_global_scale = (
				973	npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
				974	)
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	975	# Note: reuse of rescale for explicit scaling to not expose this in the external API
				976	if npu_op.rescale is not None and type(npu_op.rescale) == ExplicitScaling:
				977	use_global_scale = not npu_op.rescale.per_channel
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	978	generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
				979	# Pooling op specific
				980	if use_global_scale:
				981	generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	982
				983
				984	def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
				985	"""Generates register commands for elementwise operations"""
				986	use_global_scale = npu_op.sub_op_type in (
				987	NpuElementWiseOp.ADD,
				988	NpuElementWiseOp.SUB,
				989	NpuElementWiseOp.MUL,
				990	NpuElementWiseOp.LRELU,
				991	NpuElementWiseOp.ABS,
				992	)
				993	op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
				994	generate_common(
				995	emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
				996	)
				997	# Elementwise op specific
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	998	if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	999	# Binary operation; generate IFM2 registers
				1000	assert npu_op.ifm2 is not None
				1001	has_scalar = npu_op.ifm2_scalar is not None
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	1002	generate_ifm2(emit, npu_op.ifm2, has_scalar, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1003	generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
				1004	generate_ifm2_broadcast(emit, npu_op)
				1005	if has_scalar:
				1006	quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
				1007	assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
				1008	emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1009
				1010
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
    """Generates register commands for DMA operations

    The operation is first passed to check_dma_op; then the source region/address, the destination
    region/address and the length (taken from the source range) are emitted.
    """
    check_dma_op(dma_op, arch)

    source = dma_op.src
    destination = dma_op.dest

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1021
				1022
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    The operation is dispatched on its class to the matching generate_*_op function.
    """
    # Checked in order; the first matching class decides which generator is used
    generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
        (NpuDmaOperation, generate_dma_op),
    )
    for op_class, generate_op in generators:
        if isinstance(npu_op, op_class):
            generate_op(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1040
				1041
				1042	def generate_command_stream(
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1043	npu_op_list: List[NpuOperation],
				1044	arch: ArchitectureFeatures,
				1045	verbose: bool,
				1046	mem_limits: Dict[int, int],
				1047	add_to_debug_db=None,
				1048	npu_op_to_cmd=None,
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1049	) -> List[int]:
				1050	"""
				1051	Generates register commands for the given list of NPU operations.
				1052	Returns Ethos-U instructions, as a list of 32-bit integers.
				1053	"""
				1054	emit = CommandStreamEmitter()
				1055	if verbose:
Tim Hall	cd03504	2023-08-08 14:10:17 +0100	[diff] [blame]	1056	print("Register-Level Command Stream: Input")
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1057	print_operations(npu_op_list, npu_op_to_cmd)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1058	# Calculate memory accesses for every operation
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1059	memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1060	for npu_op in npu_op_list:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1061	if isinstance(npu_op, NpuDmaOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1062	memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1063	elif isinstance(npu_op, NpuBlockOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1064	memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1065	else:
				1066	assert 0, "Invalid operation type"
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1067
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	1068	if arch.is_ethos_u65_system:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1069	emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1070	prev_op = None
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1071	# Generate register commands for all operations
Alexander Hansson	ca9cc42	2023-06-22 16:01:27 +0000	[diff] [blame]	1072	outstanding_dma_ops: List[NpuOperation] = list()
				1073	outstanding_npu_ops: List[NpuOperation] = list()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1074	for op_index, npu_op in enumerate(npu_op_list):
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1075	try:
				1076	check_mem_limits(memory_accesses[npu_op], mem_limits)
Alexander Hansson	ca9cc42	2023-06-22 16:01:27 +0000	[diff] [blame]	1077	cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops)
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1078	generate_registers_for_op(emit, npu_op, arch)
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame]	1079	except ByteAlignmentError as e:
				1080	# Enables testing for ByteAlignmentErrors specifically
				1081	raise ByteAlignmentError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
				1082	except ByteSizeError as e:
				1083	# Enables testing for ByteSizeErrors specifically
				1084	raise ByteSizeError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1085	except VelaError as e:
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1086	raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1087	if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1088	# Generate BLOCKDEP
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	1089	blockdep = calc_blockdep(arch, prev_op, npu_op)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1090	blockdep = min(blockdep, arch.max_blockdep)
				1091	emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
				1092	prev_op = npu_op
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1093
				1094	generate_cmd_waits(emit, cmd_waits)
				1095	# Generate the actual NPU_OP command
				1096	generate_operation_code(emit, npu_op)
				1097	if add_to_debug_db is not None:
William Isaksson	e4d2f21	2024-02-10 15:54:44 +0100	[diff] [blame]	1098	if not isinstance(npu_op, NpuDmaOperation):
				1099	# Subtraction by 4 is to account for that offsets are pre-incremented.
				1100	add_to_debug_db(npu_op, emit.offset - 4)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1101	# Fill in final part of command stream:
				1102	emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1103	res = emit.to_list()
erik.andersson@arm.com	1878dab	2021-03-16 09:40:24 +0100	[diff] [blame]	1104
				1105	if emit.size_in_bytes() >= 1 << 24:
				1106	raise VelaError(
				1107	f"The command stream size exceeds the hardware limit of 16 MiB. "
				1108	f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
				1109	)
				1110
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1111	if verbose:
Tim Hall	cd03504	2023-08-08 14:10:17 +0100	[diff] [blame]	1112	print("Register-Level Command Stream: Output")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1113	emit.print_cmds()
Tim Hall	114baba	2022-05-10 12:42:27 +0100	[diff] [blame]	1114	print(f"Number of commands = {len(emit.cmd_stream)}")
				1115	print(f"Command stream length = {emit.size_in_bytes()} bytes")
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1116	return res
				1117
				1118
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    A default architecture is created for the requested accelerator. Regions 0 to 7 are each limited
    to the architecture's maximum address offset and the mem2mem base pointer index is limited to
    the SHRAM size. Generation runs non-verbosely and without debug database or command mapping.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator value used to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    mem_limits = {region: arch.max_address_offset for region in range(8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)