Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

blob: 71fec3bed40cb8662b29d448f5114a3da10fc271 [file] [log] [blame]

Alexander Hansson	ca9cc42	2023-06-22 16:01:27 +0000	[diff] [blame]	1	# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame]	16	#
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaard	c629129	2021-03-19 09:35:48 +0100	[diff] [blame]	21	import math
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	22	from collections import defaultdict
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	23	from enum import Enum
				24	from enum import IntEnum
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	25	from typing import cast
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	26	from typing import Dict
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	27	from typing import List
				28	from typing import Optional
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	29
				30	import numpy as np
				31
				32	from . import scaling
Louis Verhaard	aeae567	2020-11-02 18:04:27 +0100	[diff] [blame]	33	from .api import NpuAccelerator
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	34	from .api import NpuActivation
				35	from .api import NpuActivationOp
				36	from .api import NpuAddressRange
				37	from .api import NpuBlockOperation
				38	from .api import NpuBlockTraversal
				39	from .api import NpuConv2DOperation
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	40	from .api import NpuConvDepthWiseOperation
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	41	from .api import NpuDataType
				42	from .api import NpuDmaOperation
				43	from .api import NpuElementWiseOp
				44	from .api import NpuElementWiseOperation
				45	from .api import NpuFeatureMap
				46	from .api import NpuKernel
				47	from .api import NpuLayout
				48	from .api import NpuOperation
				49	from .api import NpuOperationType
				50	from .api import NpuPadding
				51	from .api import NpuPoolingOp
				52	from .api import NpuPoolingOperation
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	53	from .api import NpuResamplingMode
				54	from .api import NpuRoundingMode
				55	from .api import NpuShape3D
				56	from .api import NpuTileBox
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	57	from .architecture_allocator import ArchitectureBlockConfig
				58	from .architecture_allocator import try_block_config
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	59	from .architecture_features import Accelerator
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	60	from .architecture_features import ArchitectureFeatures
Louis Verhaard	5207830	2020-11-18 13:35:06 +0100	[diff] [blame]	61	from .architecture_features import create_default_arch
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	62	from .architecture_features import SHRAMElements
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	63	from .errors import ByteAlignmentError
				64	from .errors import ByteSizeError
erik.andersson@arm.com	1878dab	2021-03-16 09:40:24 +0100	[diff] [blame]	65	from .errors import VelaError
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	66	from .ethos_u55_regs.ethos_u55_regs import acc_format
				67	from .ethos_u55_regs.ethos_u55_regs import activation
				68	from .ethos_u55_regs.ethos_u55_regs import cmd0
				69	from .ethos_u55_regs.ethos_u55_regs import cmd1
				70	from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame]	71	from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlin	cf7da10	2020-05-20 09:03:40 +0200	[diff] [blame]	72	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	73	from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	74	from .numeric_util import round_away_zero
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	75	from .numeric_util import round_up_to_int
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	76	from .operation import ExplicitScaling
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	77	from .operation import NpuBlockType
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	78	from .range_set import MemoryAccessSet
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	79	from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	80	from .register_command_stream_util import calc_blockdep
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	81	from .register_command_stream_util import check_addresses
				82	from .register_command_stream_util import check_alignment
				83	from .register_command_stream_util import check_dma_op
				84	from .register_command_stream_util import check_size
				85	from .register_command_stream_util import check_strides
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	86	from .register_command_stream_util import get_dma_memory_accesses
				87	from .register_command_stream_util import get_op_memory_accesses
				88	from .register_command_stream_util import get_strides
				89	from .register_command_stream_util import get_wait_dependency
Fredrik Svedberg	f3c7d55	2022-11-04 09:48:49 +0100	[diff] [blame]	90	from .register_command_stream_util import get_zero_point
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	91	from .register_command_stream_util import has_ifm2
Fredrik Svedberg	f3c7d55	2022-11-04 09:48:49 +0100	[diff] [blame]	92	from .register_command_stream_util import quantise
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	93	from .register_command_stream_util import shape3d_to_block
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	94	from .register_command_stream_util import to_kernel
				95	from .register_command_stream_util import UNARY_ELEMWISE_OPS
				96	from .register_command_stream_util import Watermark
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	97
				98
				99	class RegisterMachine:
				100	def __init__(self):
				101	self.n_banks = 1
				102	self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
				103	self.bank_idx = 0
				104
				105	def set_register(self, reg, value):
				106	is_changed = self.registers[self.bank_idx][reg] != value
				107	self.registers[self.bank_idx][reg] = value
				108	# is_changed = True # force command
				109	return is_changed
				110
				111	def switch_bank(self):
				112	self.bank_idx = (self.bank_idx + 1) % self.n_banks
				113
				114
				115	class CmdMode(IntEnum):
				116	NoPayload = 0x0000
				117	Payload32 = 0x4000
				118	Mask = 0xC000
				119	CmdOpMask = 0x03FF
				120
				121
class CommandStreamEmitter:
    """Collects the words of a command stream and filters out redundant register writes.

    Every entry of cmd_stream is a tuple of words: a single word for a cmd0 command, or (command, payload) for a
    cmd1 command. Register setting commands are passed through a RegisterMachine, and a command that repeats the
    value already recorded for its register is not added to the stream.
    """

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []
        # Index 0 tracks non-DMA registers, index 1 tracks DMA registers (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Running size in bytes of all commands added to cmd_stream so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine that tracks cmd; commands with "DMA" in their name have their own"""
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size in bytes of all commands in the stream"""
        return sum(len(cmd) for cmd in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a list of words"""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a table with one row per command: byte offset, payload (if any), param, code and command name"""
        s = f" {'Offset':6}:"
        s += f" {'Payload':8}"
        s += f"{'Param':4}"  # no leading space for alignment
        s += f" {'Code':4}"
        s += f" - {'Command':30}"
        s += f" {'Param':5}"
        print(s)

        offset = 0
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            s = f"{offset:#08x}:"

            if payload_mode == CmdMode.NoPayload:
                s += f" {'':8}"
            else:
                assert payload_mode == CmdMode.Payload32
                s += f" {words_for_one_command[1]:08x}"

            s += f" {param:04x}"
            s += f" {code:04x}"

            if payload_mode == CmdMode.NoPayload:
                s += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                # a cmd0 command occupies a single word
                offset += CommandStreamEmitter.WORD_SIZE
            else:
                s += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                # a cmd1 command occupies the command word plus one payload word
                offset += CommandStreamEmitter.WORD_SIZE * 2

            s += f" {param:5}"
            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single word command with a 16-bit parameter, unless it repeats the last value set for cmd"""
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two word command (command + 32-bit payload), unless it repeats the last value set for cmd"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two word command where bits 32 and above of offset are passed in the command's param field"""
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; these are always written, they do not go through the register machine"""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always written) and switches the bank of the matching register machine"""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
				225
				226
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	227	# -------------------------------------------------------------------
				228	# REGISTER GENERATION
				229	# -------------------------------------------------------------------
				230
				231
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    # Bit flags that are OR'ed together to form the value of the IFM2_BROADCAST register
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7
				239
				240
				241	pooling_op_map = {
				242	NpuPoolingOp.MAX: pooling_mode.MAX.value,
				243	NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
				244	NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
				245	}
				246
				247	elementwise_op_map = {
				248	NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
				249	NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
				250	NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
				251	NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
				252	NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
				253	NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
				254	NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
				255	NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
				256	NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
				257	NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
				258	}
				259
				260	activation_op_map = {
				261	NpuActivationOp.NONE_OR_RELU: activation.NONE,
				262	NpuActivationOp.TANH: activation.TANH,
				263	NpuActivationOp.SIGMOID: activation.SIGMOID,
				264	}
				265
				266	# Maps an AccumulatorType enum to the corresponding acc_format value
				267	acc_format_map = {
				268	SHRAMElements.Acc16: acc_format.FP_S5_10.value,
				269	SHRAMElements.Acc32: acc_format.INT_32BIT.value,
				270	SHRAMElements.Acc40: acc_format.INT_40BIT.value,
				271	}
				272
				273	resampling_mode_map = {
				274	NpuResamplingMode.NONE: resampling_mode.NONE,
				275	NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
				276	NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
				277	}
				278
				279	# Maps data type size in bits to activation precision
				280	precision_map = {8: 0, 16: 1, 32: 2}
				281
				282	# Maps rounding mode to the corresponding value
				283	rounding_mode_map = {
				284	NpuRoundingMode.TFL: rounding.TFL.value,
				285	NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
				286	NpuRoundingMode.NATURAL: rounding.NATURAL.value,
				287	}
				288
				289
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    Every start and end offset of every accessed range must lie within 0..mem_limits[region] (inclusive).

    :param memory_accesses: the memory accesses to be checked
    :param mem_limits: maps a region to the highest offset that is allowed in that region
    :raises VelaError: if a region has no entry in mem_limits, or an offset is negative or above the limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Deliberately not called "max", that would shadow the builtin
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {limit}. Perhaps try running"
                            f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
                            f" allocator"
                        )
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	307
				308
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers

    :param emit: emitter that receives the register commands
    :param padding: supplies the top, left, bottom and right padding values
    """
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)
				315
				316
				317	def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
				318	"""Generates ACTIVATION registers"""
				319	act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)
				320
				321	if act.min is None:
				322	quantized_min = ofm.data_type.min_value()
				323	else:
				324	quantized_min = quantise(act.min, ofm.quantization)
				325	if act.max is None:
				326	quantized_max = ofm.data_type.max_value()
				327	else:
				328	quantized_max = quantise(act.max, ofm.quantization)
				329	quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
				330	quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
				331	if act.op_type == NpuActivationOp.TABLE_LOOKUP:
				332	assert 0 <= act.lookup_table_index < 8
				333	activation_value = 16 + act.lookup_table_index
				334	if ofm.data_type == NpuDataType.INT32:
				335	activation_value \|= 3 << 12 # Force I8 range
				336	quantized_min = max(-128, quantized_min)
				337	quantized_max = min(127, quantized_max)
				338	else:
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	339	activation_value = cast(int, activation_op_map[act.op_type])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	340	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
				341	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
				342	emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
				343
				344
def generate_addresses(
    emit: CommandStreamEmitter,
    ptr_cmds: List[cmd1],
    addresses: List[int],
    layout: NpuLayout,
    element_size,
    arch: ArchitectureFeatures,
):
    """Generates xFM_BASE registers

    The addresses are validated with check_addresses before anything is emitted.

    :param emit: emitter that receives the register commands
    :param ptr_cmds: the four base address commands, in the same order as addresses
    :param addresses: the four base addresses
    :param layout: layout of the feature map, passed on to check_addresses
    :param element_size: passed on to check_addresses; callers in this file pass data_type.size_in_bytes()
    :param arch: architecture description, passed on to check_addresses
    """
    check_addresses(addresses, layout, element_size, arch)
    # Exactly four base addresses are written, one for each entry of ptr_cmds
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	357
				358
				359	def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
				360	"""Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
				361	emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
				362	emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
				363	emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)
				364
				365
				366	def generate_strides(
				367	emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
				368	):
				369	"""Generates STRIDE_C/Y/X registers"""
				370	strides = get_strides(fm)
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	371	check_strides(fm, strides)
				372
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	373	emit.cmd1_with_address(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C)
				374	emit.cmd1_with_address(stride_y_cmd, strides.height) # stride between vertical values (H)
				375	emit.cmd1_with_address(stride_x_cmd, strides.width) # stride between horisontal values (W)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	376
				377
				378	def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
				379	"""Generates IFM/IFM2_PRECISION register"""
				380	dtype = fm.data_type
				381	prec = 1 if dtype.is_signed() else 0
				382	activation_precision = precision_map[dtype.size_in_bits()]
				383	prec += activation_precision << 2
				384
				385	if fm.layout == NpuLayout.NHCWB16:
				386	prec \|= 1 << 6
				387
				388	prec \|= op_to_scale << 8
				389	emit.cmd0_with_param(precision_cmd, prec)
				390
				391
				392	def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
				393	"""Generates OFM_PRECISION register"""
				394	dtype = npu_op.ofm.data_type
				395	prec = 1 if dtype.is_signed() else 0
				396	activation_precision = precision_map[dtype.size_in_bits()]
				397	prec += activation_precision << 1
				398
				399	if use_global_scale:
				400	# Set global scale bit, as opposed to using per channel scale
				401	prec \|= 1 << 8
				402	if npu_op.ofm.layout == NpuLayout.NHCWB16:
				403	prec \|= 1 << 6
				404	prec \|= rounding_mode_map[npu_op.rounding_mode] << 14
				405	emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
				406
				407
				408	def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
				409	"""Generates IFM2_BROADCAST register for binary elementwise operations"""
				410	ifm2_broadcast = 0
				411	ifm = npu_op.ifm
				412	ifm2 = npu_op.ifm2
				413	if npu_op.reversed_operands:
				414	ifm2_broadcast \|= IFM2Broadcast.ReverseOperandOrder
				415	if npu_op.ifm2_scalar is not None:
				416	# IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
				417	ifm2_broadcast \|= IFM2Broadcast.UseIFM2Scalar
				418	else:
				419	if ifm.shape.height != ifm2.shape.height:
				420	# Broadcast in 'H' dimension
				421	assert ifm2.shape.height == 1
				422	ifm2_broadcast \|= IFM2Broadcast.BroadcastHdim
				423
				424	if ifm.shape.width != ifm2.shape.width:
				425	# Broadcast in 'W' dimension
				426	assert ifm2.shape.width == 1
				427	ifm2_broadcast \|= IFM2Broadcast.BroadcastWdim
				428
				429	if ifm.shape.depth != ifm2.shape.depth:
				430	# Broadcast in 'C' dimension
				431	assert ifm2.shape.depth == 1
				432	ifm2_broadcast \|= IFM2Broadcast.BroadcastCdim
				433
				434	emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
				435
				436
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general IFM registers

    Emits region, base addresses, tile sizes, depth, strides and zero point of the IFM.

    :param emit: emitter that receives the register commands
    :param ifm: the input feature map
    :param arch: architecture description, used when the base addresses are checked
    """
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
        ifm.data_type.size_in_bytes(),
        arch,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	454
				455
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool, arch: ArchitectureFeatures):
    """Generates general IFM2 registers

    Region, base addresses, tile sizes and strides are only emitted when IFM2 is not a scalar;
    the zero point is always emitted.

    :param emit: emitter that receives the register commands
    :param ifm2: the second input feature map
    :param has_scalar: True if IFM2 is a scalar
    :param arch: architecture description, used when the base addresses are checked
    """
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
            ifm2.data_type.size_in_bytes(),
            arch,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	473
				474
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap, arch: ArchitectureFeatures):
    """Generates general OFM registers

    Emits region, base addresses, tile sizes, height/width/depth, strides and zero point of the OFM.

    :param emit: emitter that receives the register commands
    :param ofm: the output feature map
    :param arch: architecture description, used when the base addresses are checked
    """
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
        ofm.data_type.size_in_bytes(),
        arch,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	494
				495
				496	def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
				497	"""Generates KERNEL related registers"""
				498	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
				499	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
				500	# set kernel x stride low bit
				501	stride = (kernel.stride_x - 1) & 1
				502	# set kernel y stride low bit
				503	stride \|= (kernel.stride_y - 1 & 1) << 1
				504	# set kernel x stride extension bits
				505	stride \|= (kernel.stride_x - 1 >> 1) << 6
				506	# set kernel y stride extension bits
				507	stride \|= (kernel.stride_y - 1 >> 1) << 9
				508	stride \|= (kernel.dilation_x - 1) << 3
				509	stride \|= (kernel.dilation_y - 1) << 4
				510	if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
				511	stride \|= 1 << 2
				512	emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
				513
				514
				515	def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
				516	"""Generates WEIGHT registers"""
				517	if len(weights) == 0:
				518	return
				519	emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
				520	# Set weights sources for active and present cores
				521	for core, (addr, length) in enumerate(
				522	[
				523	(cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
				524	(cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
				525	]
				526	):
				527	if core < len(weights):
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	528	check_alignment(weights[core].address, 16)
				529	check_size(weights[core].length, 16)
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	530	emit.cmd1_with_address(addr, weights[core].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	531	emit.cmd1_with_offset(length, weights[core].length)
				532	elif core < arch.ncores:
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	533	check_alignment(weights[0].address, 16)
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	534	emit.cmd1_with_address(addr, weights[0].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	535	emit.cmd1_with_offset(length, 0)
				536
				537
				538	def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
				539	"""Generates SCALE registers"""
				540	if len(biases) == 0:
				541	return
				542	emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
				543	# Set weights sources for active and present cores
				544	for core, (addr, length) in enumerate(
				545	[(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
				546	):
				547	if core < len(biases):
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	548	emit.cmd1_with_address(addr, biases[core].address)
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	549	check_size(biases[core].length, 16)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	550	emit.cmd1_with_offset(length, biases[core].length)
				551	elif core < arch.ncores:
Mauricio Briceno	a8e48e6	2021-03-19 09:13:50 +0100	[diff] [blame]	552	emit.cmd1_with_address(addr, biases[0].address)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	553	emit.cmd1_with_offset(length, 0)
				554
				555
				556	def generate_block_config(
Jonas Ohlsson	d857507	2022-03-30 10:30:25 +0200	[diff] [blame]	557	emit: CommandStreamEmitter,
				558	block_config: NpuShape3D,
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	559	):
				560	"""Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	561	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
				562	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
				563	emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	564
				565
def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """
    Generates IB_END/IB_START/AB_START/ACC_FORMAT registers.
    The values are read from arch_block_config; IFM2_IB_START is only emitted when
    has_ifm2(npu_op) is true.
    """
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    accumulator_format = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, accumulator_format)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	577
				578
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.

    NOTE(review): this function has no statements besides this docstring, so as written
    every call returns None and none of the parameters are used. The description above
    states the intended contract, not the current behaviour; get_arch_block_config()
    performs the actual lookup via try_block_config().
    """
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	586
				587
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """
    Creates shared buffer allocation for the given operation.
    Translates npu_op.block_config into an ArchitectureBlockConfig through try_block_config()
    and asserts that both the requested block config is set and that it fits.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Select the NPU block type from the class of the operation
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        block_type = NpuBlockType.Default
        assert 0, "Unsupported operation"

    resampling = resampling_mode_map[npu_op.ifm_upscale]
    part_kernel_first = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # A table lookup activation needs two LUT banks, anything else needs none
    activation = npu_op.activation
    if activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        lut_bank_count = 2
    else:
        lut_bank_count = 0

    # The operation counts as scaled only if every feature map in use has a quantization scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    is_scaled = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps)

    ifm_bit_depth = npu_op.ifm.data_type.size_in_bits()
    ifm_block = shape3d_to_block(npu_op.ifm.shape)
    ifm2_block = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    ifm2_is_scalar = npu_op.ifm2_scalar is not None
    requested_block = shape3d_to_block(npu_op.block_config)

    result = try_block_config(
        requested_block,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_block,
        ifm2_block,
        ifm2_is_scalar,
        ifm_bit_depth,
        is_partkernel=part_kernel_first,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_bank_count,
        scaled=is_scaled,
        ifm_resampling=resampling,
    )
    assert result is not None, f"block_config {npu_op.block_config} does not fit"
    return result
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	637
				638
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT; a wait is only emitted when its watermark value is not negative"""
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, watermark in pending_waits:
        if watermark < 0:
            # Negative watermark: no wait is emitted for this unit
            continue
        emit.cmd_wait(wait_cmd, 0, watermark)
				646
				647
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """
    Generate registers that are common to most operations.
    Emits, in this order: IFM, padding (if set), OFM, kernel (skipped for elementwise
    operations), weights, biases, activation, block config and SHRAM registers.
    """
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm, arch)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_mode = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_mode)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm, arch)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Elementwise operations do not get kernel registers
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # Block configuration and the shared buffer layout derived from it
    shram_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_config)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	674
				675
				676	# -------------------------------------------------------------------
				677	# SCALING
				678	# -------------------------------------------------------------------
				679
				680
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    Computes a (scale, shift) pair and emits it with NPU_SET_OFM_SCALE. The pair is
    selected by the first case that matches:
      1. the operation has a SIGMOID or TANH activation
      2. pool_op.fused_quantize is set
      3. pool_op.rescale is set (either an ExplicitScaling or a plain rescale value)
      4. otherwise the ratio of IFM scale to OFM scale is used, or scale=1/shift=0 if
         either of the two scales is missing
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        # 0x3000 == 3 * 4096
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            # The input scale is treated as a power of two if its log2 is within 0.001 of an integer
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an input scale of 2**-12, shift == 1 to 2**-11
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid) or 1/2048 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                # Keep doubling rescale (and counting the doublings in shift) until it
                # exceeds max_rescale or shift exceeds 30
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # len(bin(x)) - 2 drops the "0b" prefix, i.e. it is the number of binary digits
            # of a positive x; one extra bit is added on top
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # NOTE(review): exact type comparison; an instance of a subclass of ExplicitScaling
        # would take the else branch
        if type(pool_op.rescale) == ExplicitScaling:
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            # Only the first multiplier/shift entry is used
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    # Negative bit count, derived from the reciprocal of rescale
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization scales available: emit a scale of 1 with no shift
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
				753
				754
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    OFM_SCALE is always emitted. OPA_SCALE and OPB_SCALE are only emitted for ADD and SUB.
    Returns the operand to scale: 0 unless the advanced add/sub scaling is used, in which
    case it is the value reported by scaling.advanced_elementwise_add_sub_scale (swapped
    between OPa and OPb when npu_op.reversed_operands is set).
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A scale is None when the corresponding feature map has no quantization
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Sigmoid/tanh override the output scale with 1/0x3000 (0x3000 == 3 * 4096)
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            # NOTE(review): truthiness test here, whereas the Add/Sub branch below tests
            # "is not None"; a falsy but non-None rescale is ignored for MUL only
            if npu_op.rescale:
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # At least one scale is missing: no ofm scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                # ("&" binds tighter than "!=": advanced scaling is used when any of the low 12 bits is set)
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                # Input scales differ
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # NOTE(review): unlike the branch above, ofm.quantization is dereferenced here
        # without a None check
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All remaining elementwise operators: no ofm scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale
				843
				844
				845	# -------------------------------------------------------------------
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	846	# PRINT
				847	# -------------------------------------------------------------------
Jacob Bohlin	e99b893	2020-07-13 16:01:51 +0200	[diff] [blame]	848
				849
def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides and tiles of fm; no output if fm is None"""
    if fm is None:
        return
    quantization = fm.quantization
    if quantization is None:
        quant_text = "no quantization"
    else:
        quant_text = f"scale: {quantization.scale_f32}, zero: {quantization.zero_point}"
    height, width, depth = fm.shape
    # Size in bytes of height * width * depth elements of the feature map's data type
    size_in_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, {fm.data_type},"
        f" size={size_in_bytes}, {quant_text}"
    )
    fm_strides = get_strides(fm)
    stride_text = f"Stride y/x/c: {fm_strides.height}/{fm_strides.width}/{fm_strides.depth}"
    tiles = fm.tiles
    # Tile base addresses are shown in hexadecimal
    base_addresses = list(map(hex, tiles.addresses))
    print(
        f" {stride_text}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1},"
        f" base={base_addresses}"
    )
    print(f" name={fm.name}")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	866
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	867
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of the given NPU operation.

    npu_op: operation to print
    index:  number printed in front of the operation
    cmd:    optional object that is appended to the header line when it is truthy

    Operations that are neither DMA nor block operations, and DMA operations, are printed
    as a single header line. Block operations additionally get their feature maps, kernel,
    padding, weights, scales, activation and block config printed.
    """
    pass_info = f" {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    # The kernel is optional on block operations
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations are all 1 is labelled as fully connected.
        # The "k is not None" guard keeps this debug printout from raising AttributeError on an
        # operation without a kernel, consistent with the None handling of k elsewhere in this function.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar IFM2: print the value and its quantised form instead of a feature map
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # A NONE_OR_RELU activation without min/max is not printed
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Only pooling and elementwise operations have their rescale attribute printed
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	915
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	916
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list, numbered by position, with its entry from npu_op_to_cmd (if any)"""
    if npu_op_to_cmd is None:
        npu_op_to_cmd = dict()
    find_cmd = npu_op_to_cmd.get
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, find_cmd(operation))
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	921
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	922
				923	# -------------------------------------------------------------------
				924	# OPERATIONS
				925	# -------------------------------------------------------------------
				926
				927
				928	def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
				929	"""Generates NPU_OP_* command"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	930	if isinstance(npu_op, NpuDmaOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	931	emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	932	elif isinstance(npu_op, NpuConv2DOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	933	emit.cmd_do_operation(cmd0.NPU_OP_CONV)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	934	elif isinstance(npu_op, NpuConvDepthWiseOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	935	emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	936	elif isinstance(npu_op, NpuPoolingOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	937	emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	938	elif isinstance(npu_op, NpuElementWiseOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	939	emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
				940	else:
				941	assert 0, "Unsupported operation"
				942
				943
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations, using the block traversal stored on the operation"""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	947
				948
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations; block traversal is always DEPTH_FIRST"""
    traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	954
				955
				956	def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
				957	"""Generates register commands for pooling operations"""
Tim Hall	d6efcd3	2022-09-02 15:01:01 +0100	[diff] [blame]	958	# check that reduce_sum input is NHWC
				959	if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
				960	if npu_op.ifm.data_type == NpuDataType.INT32:
				961	raise VelaError(
				962	f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
				963	f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
				964	)
				965	elif arch.accelerator_config == Accelerator.Ethos_U65_512:
				966	raise VelaError(
				967	f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
				968	f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
				969	)
				970
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	971	use_global_scale = (
				972	npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
				973	)
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	974	# Note: reuse of rescale for explicit scaling to not expose this in the external API
				975	if npu_op.rescale is not None and type(npu_op.rescale) == ExplicitScaling:
				976	use_global_scale = not npu_op.rescale.per_channel
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	977	generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
				978	# Pooling op specific
				979	if use_global_scale:
				980	generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	981
				982
				983	def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
				984	"""Generates register commands for elementwise operations"""
				985	use_global_scale = npu_op.sub_op_type in (
				986	NpuElementWiseOp.ADD,
				987	NpuElementWiseOp.SUB,
				988	NpuElementWiseOp.MUL,
				989	NpuElementWiseOp.LRELU,
				990	NpuElementWiseOp.ABS,
				991	)
				992	op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
				993	generate_common(
				994	emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
				995	)
				996	# Elementwise op specific
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	997	if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	998	# Binary operation; generate IFM2 registers
				999	assert npu_op.ifm2 is not None
				1000	has_scalar = npu_op.ifm2_scalar is not None
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	1001	generate_ifm2(emit, npu_op.ifm2, has_scalar, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1002	generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
				1003	generate_ifm2_broadcast(emit, npu_op)
				1004	if has_scalar:
				1005	quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
				1006	assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
				1007	emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1008
				1009
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for DMA operations.
    The operation is passed to check_dma_op() first; after that the source region/address,
    the destination region/address and the length of the source range are emitted.
    """
    check_dma_op(dma_op, arch)

    source = dma_op.src
    destination = dma_op.dest

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    # The transfer length is taken from the source range
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1020
				1021
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Dispatches on the class of the operation; nothing is returned.
    """
    # Checked in this order; the first matching class decides which generator is used
    generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
        (NpuDmaOperation, generate_dma_op),
    )
    for operation_class, generate in generators:
        if isinstance(npu_op, operation_class):
            generate(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1039
				1040
				1041	def generate_command_stream(
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1042	npu_op_list: List[NpuOperation],
				1043	arch: ArchitectureFeatures,
				1044	verbose: bool,
				1045	mem_limits: Dict[int, int],
				1046	add_to_debug_db=None,
				1047	npu_op_to_cmd=None,
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1048	) -> List[int]:
				1049	"""
				1050	Generates register commands for the given list of NPU operations.
				1051	Returns Ethos-U instructions, as a list of 32-bit integers.
				1052	"""
				1053	emit = CommandStreamEmitter()
				1054	if verbose:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1055	print_operations(npu_op_list, npu_op_to_cmd)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1056	# Calculate memory accesses for every operation
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1057	memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1058	for npu_op in npu_op_list:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1059	if isinstance(npu_op, NpuDmaOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1060	memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1061	elif isinstance(npu_op, NpuBlockOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1062	memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1063	else:
				1064	assert 0, "Invalid operation type"
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1065
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	1066	if arch.is_ethos_u65_system:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1067	emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1068	prev_op = None
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1069	# Generate register commands for all operations
Alexander Hansson	ca9cc42	2023-06-22 16:01:27 +0000	[diff] [blame]	1070	outstanding_dma_ops: List[NpuOperation] = list()
				1071	outstanding_npu_ops: List[NpuOperation] = list()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1072	for op_index, npu_op in enumerate(npu_op_list):
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1073	try:
				1074	check_mem_limits(memory_accesses[npu_op], mem_limits)
Alexander Hansson	ca9cc42	2023-06-22 16:01:27 +0000	[diff] [blame]	1075	cmd_waits = get_wait_dependency(arch, npu_op, memory_accesses, outstanding_dma_ops, outstanding_npu_ops)
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1076	generate_registers_for_op(emit, npu_op, arch)
William Isaksson	a4f8411	2023-06-19 15:31:46 +0000	[diff] [blame^]	1077	except ByteAlignmentError as e:
				1078	# Enables testing for ByteAlignmentErrors specifically
				1079	raise ByteAlignmentError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
				1080	except ByteSizeError as e:
				1081	# Enables testing for ByteSizeErrors specifically
				1082	raise ByteSizeError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1083	except VelaError as e:
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	1084	raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	1085	if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1086	# Generate BLOCKDEP
Louis Verhaard	933f55e	2020-11-25 14:10:30 +0100	[diff] [blame]	1087	blockdep = calc_blockdep(arch, prev_op, npu_op)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1088	blockdep = min(blockdep, arch.max_blockdep)
				1089	emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
				1090	prev_op = npu_op
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1091
				1092	generate_cmd_waits(emit, cmd_waits)
				1093	# Generate the actual NPU_OP command
				1094	generate_operation_code(emit, npu_op)
				1095	if add_to_debug_db is not None:
				1096	add_to_debug_db(npu_op, emit.offset)
				1097	# Fill in final part of command stream:
				1098	emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1099	res = emit.to_list()
erik.andersson@arm.com	1878dab	2021-03-16 09:40:24 +0100	[diff] [blame]	1100
				1101	if emit.size_in_bytes() >= 1 << 24:
				1102	raise VelaError(
				1103	f"The command stream size exceeds the hardware limit of 16 MiB. "
				1104	f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
				1105	)
				1106
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1107	if verbose:
				1108	emit.print_cmds()
Tim Hall	114baba	2022-05-10 12:42:27 +0100	[diff] [blame]	1109	print(f"Number of commands = {len(emit.cmd_stream)}")
				1110	print(f"Command stream length = {emit.size_in_bytes()} bytes")
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	1111	return res
				1112
				1113
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Backs the public API for producing an Ethos-U register command stream: builds a default
    architecture for the requested accelerator and generates the stream non-verbosely.
    Dependencies between commands are calculated and wait operations inserted where needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator selecting which Ethos-U accelerator to generate for
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # Regions 0..7 are all limited by the architecture's maximum address offset
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    # The mem2mem base pointer index is limited by the SHRAM size instead
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)