# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
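    """Remembers the last value written to each register so that redundant register writes can be skipped."""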
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
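    """Payload mode bits of a command code word; cmd1 commands carry a 32-bit payload word, cmd0 commands do not."""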
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
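    """Builds up the command stream as a list of command word tuples and tracks the current byte offset."""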
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
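        """DMA commands are tracked by a separate register machine from the NPU commands."""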
124 if "DMA" in cmd.name:
125 return self.reg_machine[1]
126 else:
127 return self.reg_machine[0]
128
129 def size_in_bytes(self):
130 sz = 0
131 for cmd in self.cmd_stream:
Tim Halle6ccd872020-11-09 16:46:37 +0000132 sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
Tim Hall79d07d22020-04-27 18:20:16 +0100133 return sz
134
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100135 def to_list(self) -> List[int]:
Tim Hall79d07d22020-04-27 18:20:16 +0100136 return [elem for cmd in self.cmd_stream for elem in cmd]
137
138 def print_cmds(self):
139 print("Code: Command: Param: Payload:")
140 for words_for_one_command in self.cmd_stream:
141 code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits
142 param = words_for_one_command[0] >> 16 # higher 16 bits
143
144 payload_mode = CmdMode(code & CmdMode.Mask)
145
146 # code and command
147 s = " 0x%04x " % code
148 if payload_mode == CmdMode.NoPayload:
149 s += str(cmd0(code & CmdMode.CmdOpMask))
150 else:
151 s += str(cmd1(code & CmdMode.CmdOpMask))
152
153 s = s.ljust(40)
154 s += "%5d" % param
155
156 # payload
157 if payload_mode == CmdMode.Payload32:
158 s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
159 else:
160 s += " -"
161
162 print(s)
163
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100164 def cmd0_with_param(self, cmd: cmd0, param):
Tim Hall79d07d22020-04-27 18:20:16 +0100165 if isinstance(param, Enum):
166 param = int(param.value)
167 else:
168 param = int(param)
169 param = param & 0xFFFF
170 command = cmd.value | (param << 16)
171 if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
172 return
173
174 # This is not a redundant command, actually write it
175 self.cmd_stream.append((command,))
Tim Halle6ccd872020-11-09 16:46:37 +0000176 self.offset += CommandStreamEmitter.WORD_SIZE
Tim Hall79d07d22020-04-27 18:20:16 +0100177
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100178 def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
Louis Verhaard893780c2021-03-30 09:02:30 +0200179 offset = int(offset) & 0xFFFFFFFF
180 param = int(param) & 0xFFFF
Tim Hall79d07d22020-04-27 18:20:16 +0100181 command = cmd.value | CmdMode.Payload32.value | (param << 16)
182
183 if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
184 return
185
186 # This is not a redundant command, actually write it
187 self.cmd_stream.append((command, offset))
Tim Halle6ccd872020-11-09 16:46:37 +0000188 self.offset += CommandStreamEmitter.WORD_SIZE * 2
Tim Hall79d07d22020-04-27 18:20:16 +0100189
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100190 def cmd1_with_address(self, cmd: cmd1, offset):
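        """Emits a cmd1 command for an address: the low 32 bits go in the payload, the bits above 32 in the parameter."""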
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
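        """Emits a KERNEL_WAIT/DMA_WAIT command; the parameter packs the event channel and the outstanding count."""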
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
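        """Emits an NPU_OP_* command word and cycles the register machine bank."""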
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


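# Maps NpuPoolingOp to the corresponding pooling_mode value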
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

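# Maps NpuElementWiseOp to the corresponding elementwise_mode value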
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

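# Maps NpuActivationOp to the corresponding activation value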
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

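# Maps NpuResamplingMode to the corresponding resampling_mode value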
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    pass_info = f", {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    if isinstance(npu_op, NpuDmaOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if isinstance(npu_op, NpuBlockOperation):
        arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
        shared_buffer = create_shared_buffer(npu_op, arch)
        blocks = find_suitable_block_configs(arch, shared_buffer)
        return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]
    return []


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)