# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import try_block_config
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import create_default_arch
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import shape3d_to_block
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
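
    # Note (illustrative): set_register() returns True only when the cached value for a
    # register changes, which lets CommandStreamEmitter below skip emitting redundant
    # register writes; e.g. two identical NPU_SET_IFM_REGION writes in a row produce a
    # single command word.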


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


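# Illustrative note on the command word layout used by CommandStreamEmitter below: a cmd0
# command is a single 32-bit word, with the opcode in the low 16 bits and the parameter in
# the high 16 bits; a cmd1 command is the same word with the Payload32 bit set, followed by
# one 32-bit payload word. For example, cmd0_with_param(cmd0.NPU_SET_IFM_REGION, 1) appends
# the single word (1 << 16) | cmd0.NPU_SET_IFM_REGION.value.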
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
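# For example (assuming quantise_float32 rounds value / scale and then adds the zero point),
# quantise(1.0, NpuQuantization(scale_f32=0.5, zero_point=10)) would give 12.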


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
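# Worked example (derived from the bit packing above): a signed 8-bit feature map in NHWC
# layout with op_to_scale == 0 gives prec == 1; the same feature map in NHCWB16 layout
# gives 1 | (1 << 6) == 0x41.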


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
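# Example (derived from the checks above): an IFM of shape HxWxC combined with an IFM2 of
# shape 1x1xC (H, W > 1) sets BroadcastHdim and BroadcastWdim but not BroadcastCdim.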


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
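# Worked example (derived from the encoding above): a 2x2 stride with 1x1 dilation and
# DEPTH_FIRST traversal packs to 0b11 (x and y stride low bits set, extension and dilation
# bits clear); PART_KERNEL_FIRST traversal would additionally set bit 2.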


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])


def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """


def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates the ArchitectureBlockConfig for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
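# Note: NPU_SET_OFM_SCALE carries the quantised multiplier in the 32-bit payload and the
# shift amount in the 16-bit parameter field (see cmd1_with_offset); the scale/shift pairs
# computed above are intended to approximate multiplication by scale * 2**-shift.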


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale and bitdepth == 16:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale /= 2
                opb_scale /= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    pass_info = f", {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    if isinstance(npu_op, NpuDmaOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if isinstance(npu_op, NpuBlockOperation):
        # TODO: implement this function
        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
        block = arch.ofm_ublock
        return [NpuShape3D(height=block.height, width=block.width, depth=block.depth)]
    return []


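# Illustrative usage (hypothetical operation list; in practice this entry point is reached
# via vela's public API):
#
#   ops: List[NpuOperation] = build_npu_ops()  # hypothetical helper producing NPU operations
#   cmds = generate_register_command_stream(ops, NpuAccelerator.Ethos_U55_128)
#   # cmds is a list of 32-bit words ready to be placed in the command stream buffer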
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)