blob: f92536915ec3900e7ec2742602f817bfbd19700e [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010061from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010062from .ethos_u55_regs.ethos_u55_regs import acc_format
63from .ethos_u55_regs.ethos_u55_regs import activation
64from .ethos_u55_regs.ethos_u55_regs import cmd0
65from .ethos_u55_regs.ethos_u55_regs import cmd1
66from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020067from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020068from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010069from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010070from .numeric_util import quantise_float32
71from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010073from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010074from .range_set import MemoryAccessSet
Louis Verhaard1e170182020-11-26 11:42:04 +010075from .register_command_stream_util import calc_blockdep
76from .register_command_stream_util import get_dma_memory_accesses
77from .register_command_stream_util import get_op_memory_accesses
78from .register_command_stream_util import get_strides
79from .register_command_stream_util import get_wait_dependency
80from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010081from .register_command_stream_util import to_kernel
82from .register_command_stream_util import UNARY_ELEMWISE_OPS
83from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010084from .shared_buffer_allocation import find_suitable_block_configs
85from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
86from .shared_buffer_allocation import SharedBufferAllocation
Tim Hall79d07d22020-04-27 18:20:16 +010087
88
class RegisterMachine:
    """Tracks the last value written to each register, per bank, so that
    redundant register writes can be suppressed."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records *value* for *reg*; returns True if it differs from the last write."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        return is_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
103
104
class CmdMode(IntEnum):
    """Payload-mode bits of a command stream command word."""

    NoPayload = 0x0000  # cmd0: single command word, 16-bit parameter in the upper half
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of a command code
    CmdOpMask = 0x03FF  # selects the command opcode bits of a command code
110
111
class CommandStreamEmitter:
    """Collects Ethos-U command stream words and tracks the current byte offset."""

    WORD_SIZE = 4  # size of one command stream word, in bytes

    def __init__(self):
        self.cmd_stream = []  # one tuple of 32-bit words per command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset where the next command will be placed

    def get_reg_machine(self, cmd):
        # DMA commands are tracked by a separate register machine
        idx = 1 if "DMA" in cmd.name else 0
        return self.reg_machine[idx]

    def size_in_bytes(self):
        """Returns the total size of the emitted stream, in bytes."""
        return sum(len(words) for words in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Flattens the command stream into a single list of 32-bit words."""
        flat = []
        for words in self.cmd_stream:
            flat.extend(words)
        return flat

    def print_cmds(self):
        """Prints a human readable decoding of the emitted commands."""
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)
            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))
            s = s.ljust(40)
            s += "%5d" % param
            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"
            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a cmd0 command with a 16-bit parameter, unless redundant."""
        value = int(param.value) if isinstance(param, Enum) else int(param)
        value &= 0xFFFF
        command = cmd.value | (value << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, value)):
            # Register content changed; the command must actually be written
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a cmd1 command with a 32-bit payload, unless redundant."""
        offset = int(offset) & 0xFFFFFFFFF  # payload masked to 36 bits
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        # Bits above 32 of the address travel in the 16-bit parameter field
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; never elided by the register machine."""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation-kick command and switches register bank."""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
203
204
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100205# -------------------------------------------------------------------
206# REGISTER GENERATION
207# -------------------------------------------------------------------
208
209
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100210# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit assignments for the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the channel dimension
    ReverseOperandOrder = 1 << 6  # swap the order of the two operands
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
217
218
# Maps NpuPoolingOp to the hardware pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the hardware elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the hardware activation register value
# (TABLE_LOOKUP is handled separately in generate_activation)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps NpuResamplingMode to the hardware resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
266
267
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes *value* using the scale and zero point of *quant* (identity if absent)."""
    if quant is None:
        scale = 1
        zp = 0
    else:
        # A missing scale also falls back to 1, but the zero point is still honoured
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
273
274
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the four IFM_PAD registers"""
    pad_regs = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for reg, value in pad_regs:
        emit.cmd0_with_param(reg, value)
281
282
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION and ACTIVATION_MIN/MAX registers"""
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    # Explicit min/max are quantized; otherwise the OFM data type range is used
    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)
    # Clamp to what the registers can hold (int16) and to the OFM data type range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())

    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
309
310
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers.

    :param emit: command stream emitter receiving the register writes
    :param ptr_cmds: base-address registers to set, paired element-wise with addresses
    :param addresses: tile base addresses
    :param layout: feature map layout; NHCWB16 imposes an alignment restriction
    """
    if layout == NpuLayout.NHCWB16:
        # All base pointers of an NHCWB16 feature map must be 16-byte aligned
        assert all(int(addr) % 16 == 0 for addr in addresses)
    # Generalized from a hard-coded range(4): emit one register write per
    # provided (command, address) pair, so the pairing is explicit and callers
    # are not silently required to pass exactly four of each
    for ptr_cmd, address in zip(ptr_cmds, addresses):
        emit.cmd1_with_address(ptr_cmd, address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100318
319
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold the extent minus one
    extents = (tiles.height_0, tiles.height_1, tiles.width_0)
    for tile_cmd, extent in zip(tile_cmds, extents):
        emit.cmd0_with_param(tile_cmd, extent - 1)
325
326
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    # C: stride between 16-byte channel blocks, Y: between rows, X: between columns
    emit.cmd1_with_address(stride_c_cmd, strides.depth)
    emit.cmd1_with_address(stride_y_cmd, strides.height)
    emit.cmd1_with_address(stride_x_cmd, strides.width)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # bit 0: signedness, bits 2..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
349
350
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    dtype = ofm.data_type
    # bit 0: signedness, bits 1..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
365
366
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant scalar, not a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape = npu_op.ifm.shape
        ifm2_shape = npu_op.ifm2.shape
        dims = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),
        )
        for ifm_dim, ifm2_dim, flag in dims:
            if ifm_dim != ifm2_dim:
                # Broadcasting is only legal from a dimension of size 1
                assert ifm2_dim == 1
                broadcast |= flag
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
394
395
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
411
412
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, base addresses and tiles only apply when IFM2 is a real feature map
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
428
429
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape, each register holds extent minus one
    for shape_cmd, extent in (
        (cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height),
        (cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width),
        (cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth),
    ):
        emit.cmd0_with_param(shape_cmd, extent - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
447
448
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Low bit of each stride
    stride = (stride_x_m1 & 1) | ((stride_y_m1 & 1) << 1)
    # Stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    # Dilation
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
466
467
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Present core without its own weights: reuse core 0's address, zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
486
487
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Present core without its own biases: reuse core 0's address, zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
503
504
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers (arch is unused in the body)"""
    blk = npu_op.block_config
    assert blk is not None, "block_config has not been set"
    # Sanity check: the chosen block configuration must fit in the shared buffer
    alloc = shared_buffer.try_block(Block(blk.width, blk.height, blk.depth))
    assert alloc is not None, f"Block config {blk} does not fit, op: {npu_op.op_type}"
    for blk_cmd, extent in (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, blk.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, blk.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, blk.depth),
    ):
        emit.cmd0_with_param(blk_cmd, extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100519
520
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # Elementwise ops claim all available SHRAM for the input buffers
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # No accumulator buffers are needed, so AB_START is placed at the end of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # IFM2 starts in the latter part of the IB space, split by ifm_count
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
542
543
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    ifm_area = shared_buffer.bank_locations[SharedBufferArea.IFM]
    ifm_ib_end = ifm_area + shared_buffer.banks_required[SharedBufferArea.IFM]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
552
553
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        # Reduce-sum is a special pooling sub-op with its own block type
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)
568
569
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT commands; a negative count means no wait"""
    wait_specs = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in wait_specs:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
577
578
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations.

    The emission order below is kept as-is; registers must be programmed
    before the subsequent operation-kick command is emitted by the caller.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # IFM: region, addresses, tiles, strides, zero point, precision, upscaling
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # OFM: region, addresses, tiles, shape, strides, zero point, precision
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        # Elementwise operations have no kernel; all others must provide one
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block configuration and SHRAM layout depend on the shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100608
609
610# -------------------------------------------------------------------
611# SCALING
612# -------------------------------------------------------------------
613
614
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations.

    Selects one of four scaling strategies depending on the fused activation,
    fused quantize, or an explicit rescale on the operation.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh use a fixed output scale of 1/0x3000
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double the scale until it no longer fits, or the shift is exhausted
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # rescale_bits: number of bits needed to represent the rescale value
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization information available; use identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
667
668
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    Returns the operand to scale (scaling.OperandToScale value, or 0 when no
    operand scaling is needed) so that the caller can pass it on to the common
    register generation.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Scales may legitimately be missing (no quantization on a tensor);
        # each branch below falls back to unit scaling in that case.
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh uses a fixed output scale expected by the LUT-based activation
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization: emit unit OFM scale
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization: default to unit scaling, unless the
                # operation carries an explicit rescale override
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LRELU/ABS only need an OFM scale derived from the output quantization
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining elementwise ops (e.g. min/max/shift): unit OFM scale
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
727
728
729# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100730# PRINT
731# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200732
733
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints a summary of the given feature map (shape, layout, strides, tiles); no-op for None."""
    if fm is None:
        return
    if fm.quantization is None:
        quant_info = "no quantization"
    else:
        quant_info = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    total_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f"    {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, "
        f"{fm.data_type}, size={total_bytes}, {quant_info}"
    )
    fm_strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {fm_strides.height}/{fm_strides.width}/{fm_strides.depth}"
    tiles = fm.tiles
    base_addresses = [hex(addr) for addr in tiles.addresses]
    print(
        f"         {stride_str}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, "
        f"h1={tiles.height_1}, base={base_addresses}"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100749
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100750
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of one NPU operation.

    :param npu_op: operation to print
    :param index: position of the operation in the command list, printed as prefix
    :param cmd: optional extra pass/command info appended to the header line
    """
    pass_info = f", {cmd}" if cmd else ""
    # Plain NpuOperation subclasses that are neither DMA nor block operations
    # only get a one-line header
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    # From here on npu_op is a block operation
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 stride-1 dilation-1 convolution is reported as fully connected
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar IFM2 is printed with its quantized value instead of a feature map dump
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    # Elementwise ops have no real kernel, so skip the kernel line for them
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # Only print activation info when it is not a plain (unclamped) RELU/no-op
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Pooling/elementwise operations may carry a rescale; other block ops do not
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100798
Tim Hall79d07d22020-04-27 18:20:16 +0100799
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, passing along any mapped command info."""
    op_to_cmd = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, op_to_cmd.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100804
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100805
806# -------------------------------------------------------------------
807# OPERATIONS
808# -------------------------------------------------------------------
809
810
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the final NPU_OP_* command that starts execution of the given operation."""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start encodes channel (upper bits) and mode (lower bits) in one parameter
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        # Pooling sub-type (avg/max/reduce-sum) selects the command parameter
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
825
826
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations; the block traversal comes from the operation itself"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100830
831
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations; always uses depth-first traversal"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100837
838
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations."""
    # A global OFM scale is only used for average/reduce-sum pooling without padding
    is_avg_or_reduce = npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    global_scale = is_avg_or_reduce and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=global_scale)
    if global_scale:
        # Pooling specific: emit the OFM scale register
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100848
849
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    # Only these sub-ops drive the OFM via the global scale register
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    # Scale registers must be emitted first; op_to_scale feeds into the common generation
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            # Quantized scalar must fit in IFM2's data type range
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100875
876
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations: source/destination region + address, and length"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    # Transfer length is taken from the source address range
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100885
886
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    """
    # Block operations all share the (emit, op, arch) generator signature
    block_op_generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
    )
    for op_class, generator in block_op_generators:
        if isinstance(npu_op, op_class):
            generator(emit, npu_op, arch)
            return
    # DMA takes no architecture argument
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100904
905
def generate_command_stream(
    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None, npu_op_to_cmd=None
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to encode, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated commands
    :param add_to_debug_db: optional callback(npu_op, offset) invoked after each op is emitted
    :param npu_op_to_cmd: optional mapping from operation to pass info, used for verbose printing
    :raises VelaError: if the generated stream exceeds the 16 MiB hardware limit
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"
    if arch.is_ethos_u65_system:
        # Multi-core systems need the parallel mode register set up front
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        # Work out which earlier commands this op must wait for, based on memory overlap
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP: block-level dependency on the previous block operation
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Waits must be emitted before the NPU_OP command they guard
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # Hardware limit: command stream size field is 24 bits => max 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
960
961
962# -------------------------------------------------------------------
963# EXTERNAL API
964# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100965
966
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.

    Operations that are not block operations (e.g. DMA) have no block configs,
    giving an empty list.
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    candidates = find_suitable_block_configs(arch, shared_buffer)
    # NOTE(review): candidate entries are indexed [0]=height, [1]=width, [3]=depth;
    # index 2 is intentionally unused here
    return [NpuShape3D(height=cfg[0], width=cfg[1], depth=cfg[3]) for cfg in candidates]
Louis Verhaard933f55e2020-11-25 14:10:30 +0100977
978
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    return generate_command_stream(npu_op_list, arch, verbose=False)