# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional
from typing import Tuple

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
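        # Worked example of the encoding above (illustrative values, not a real
        # opcode mapping): assuming cmd.value == 0x010E and param == 5, the
        # emitted word is 0x010E | (5 << 16) == 0x0005010E. print_cmds() undoes
        # this: code = word & 0xFFFF, param = word >> 16, and CmdMode(code &
        # CmdMode.Mask) distinguishes no-payload (cmd0) commands from those
        # carrying a 32-bit payload word (cmd1).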

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
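    # Example (illustrative): quantise(1.0, NpuQuantization(scale_f32=0.5, zero_point=10))
    # yields round(1.0 / 0.5) + 10 = 12, given quantise_float32's
    # value/scale-plus-zero-point convention.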


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
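    # Resulting bit layout, as assembled above: bit 0 = signed, bits 2..3 =
    # activation precision (from precision_map), bit 6 = NHCWB16 layout,
    # bits 8.. = op_to_scale. E.g. (illustrative) a signed INT8 NHWC feature
    # map with op_to_scale=0 encodes as prec = 1.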


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
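    # Worked example (illustrative): stride_x=2, stride_y=2, dilation 1x1 and
    # DEPTH_FIRST traversal give (2-1)&1 = 1 in bit 0, (2-1)&1 = 1 in bit 1 and
    # zero extension/dilation bits, i.e. stride = 0b11. Note that `-` binds
    # tighter than `&` and `>>` in Python, so `kernel.stride_y - 1 & 1` parses
    # as `(kernel.stride_y - 1) & 1`.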


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    op_type = npu_op.op_type
    block_type = NpuBlockType.Default
    if op_type == NpuOperationType.Conv2D:
        block_type = NpuBlockType.ConvolutionMxN
    elif op_type == NpuOperationType.ConvDepthWise:
        block_type = NpuBlockType.ConvolutionDepthWise
    elif op_type == NpuOperationType.Pooling:
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif op_type == NpuOperationType.ElementWise:
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if npu_op.op_type == NpuOperationType.ElementWise:
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output the scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool is fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
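    # Example (illustrative): in the fused-quantize branch, ifm scale 0.05 and
    # ofm scale 0.1 ask scaling.quantise_scale for a fixed-point approximation
    # of 0.5, i.e. a (scale, shift) pair with scale / (1 << shift) close to 0.5;
    # the exact pair depends on quantise_scale's chosen precision.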


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def range_lists_overlap(list1: List[Optional[NpuAddressRange]], list2: List[Optional[NpuAddressRange]]) -> bool:
    """Checks if there is any address overlap between list1 and list2"""
    for range1 in list1:
        if range1 is None:
            continue
        for range2 in list2:
            if range2 is not None and ranges_overlap(range1, range2):
                return True
    return False


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
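    # Example (illustrative): an 8-bit NHWC feature map with w=16, c=64 gets
    # stride_c = 1 byte, stride_x = 64 bytes (one pixel of channels) and
    # stride_y = 16 * 64 = 1024 bytes (one row).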


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )
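    # Addressing note (illustrative): in NHCWB16, channel c falls in 16-channel
    # brick c // BRICK, at byte offset (c % BRICK) * elem_size within the brick,
    # so a pixel's neighbouring channels are contiguous in memory while bricks
    # lie strides.depth bytes apart.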


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """
    Gets address range for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm).
    The begin and end coordinates must be within the same tile.
    """
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_h_ranges(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Gets address ranges for (y0, x0, c0) - (y1, x1, c1) (inclusive, so the second coordinate is within the fm);
    the begin and end coordinates must be within the same tile.
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".
    """
    return [get_address_range(fm, strides, y, x0, c0, y, x1, c1) for y in range(y0, y1 + 1)]


def get_address_ranges_for_area(
    fm: NpuFeatureMap, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> List[NpuAddressRange]:
    """
    Returns a list of address ranges that covers the area (y0, x0, c0) - (y1, x1, c1) (inclusive).
    Divides the area in horizontal "stripes" of height 1, and returns the address ranges for these "stripes".

    For example, for the area marked with X (in a feature map with 4 tiles) as input, this function would return
    6 address ranges: the address ranges for 1-height areas [AAA, BBB, CC, DD, EEE, FF]

         .....|....           .....|....
      t0 ..XXX|XX.. t1     t0 ..AAA|CC.. t1
         ..XXX|XX..           ..BBB|DD..
         -----+----    -->    -----+----
      t2 ..XXX|XX.. t3     t2 ..EEE|FF.. t3
         .....|....           .....|....
    """
    strides = get_strides(fm)
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    h, w, c = fm.shape
    y2, x2, c2 = min(y1, h - 1), min(x1, w - 1), min(c1, c - 1)
    ranges = []
    if x0 < width_0 and y0 < height_0:
        # Horizontal ranges for tile 0
        ranges.extend(get_h_ranges(fm, strides, y0, x0, c0, min(y2, height_0 - 1), min(x2, width_0 - 1), c2))
    if x2 >= width_0 and y0 < height_1:
        # Horizontal ranges for tile 1
        ranges.extend(get_h_ranges(fm, strides, y0, max(x0, width_0), c0, min(y2, height_1 - 1), x2, c2))
    if x0 < width_0 and y2 >= height_0:
        # Horizontal ranges for tile 2
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_0), x0, c0, y2, min(x2, width_0 - 1), c2))
    if x2 >= width_0 and y2 >= height_1:
        # Horizontal ranges for tile 3
        ranges.extend(get_h_ranges(fm, strides, max(y0, height_1), max(x0, width_0), c0, y2, x2, c2))
    return ranges


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_1, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
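    # Example (illustrative): scanning backwards, if this op reads a buffer
    # written by the second-most-recent DMA (one unrelated DMA was issued after
    # it), dma_outstanding becomes 1; the resulting DMA_WAIT with
    # outstanding_count=1 fences the dependency while letting the newer
    # transfer stay in flight.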


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits the KERNEL_WAIT/DMA_WAIT commands for the given dependencies"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def shape3d_size(shape: NpuShape3D) -> int:
    return shape.width * shape.height * shape.depth


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    # Check if IFM or IFM2 overlaps with prev op's OFM
    prev_ofm_ranges = get_address_ranges(prev_op.ofm)
    ifm_ranges = get_address_ranges(npu_op.ifm)
    ifm_overlaps = range_lists_overlap(prev_ofm_ranges, ifm_ranges)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        ifm2_ranges = get_address_ranges(npu_op.ifm2)
        ifm2_overlaps = range_lists_overlap(prev_ofm_ranges, ifm2_ranges)
    else:
        ifm2_overlaps = False
    if ifm_overlaps and ifm2_overlaps:
        # Both IFM and IFM2 overlap (should be rare)
        return 0
    if not ifm_overlaps and not ifm2_overlaps:
        # No overlap between prev OFM and IFM/IFM2
        return ArchitectureFeatures.MAX_BLOCKDEP
    if ifm2_overlaps and shape3d_size(npu_op.ifm2.shape) < shape3d_size(npu_op.ifm.shape):
        # Prev OFM produces IFM2 which is broadcasted (this should be rare)
        return 0
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    overlapping_fm = npu_op.ifm if ifm_overlaps else npu_op.ifm2
    assert overlapping_fm is not None

    def intersects(ifm_start_coord: Tuple, ifm_end_coord: Tuple, ofm_start_coord: Tuple, ofm_end_coord: Tuple) -> bool:
        """Checks if the given IFM area overlaps with the given OFM area"""
        if overlapping_fm.shape == prev_op.ofm.shape and overlapping_fm.tiles == prev_op.ofm.tiles:
            # Common case: prev_op.ofm == op.ifm; in this case it suffices to check
            # if the xyz coordinates overlap, which is quick and easy
            return ArchitectureFeatures.intersects(ifm_start_coord, ifm_end_coord, ofm_start_coord, ofm_end_coord)
        # The OFM produces a part of the IFM (e.g. a stripe), or the IFM consumes part of the OFM.
        # In this case address comparison is needed between the two areas
        x0, y0, c0 = ifm_start_coord
        x1, y1, c1 = ifm_end_coord
        ifm_ranges = get_address_ranges_for_area(overlapping_fm, y0, x0, c0, y1, x1, c1)
        x0, y0, c0 = ofm_start_coord
        x1, y1, c1 = ofm_end_coord
        prev_ofm_ranges = get_address_ranges_for_area(prev_op.ofm, y0, x0, c0, y1, x1, c1)
        return range_lists_overlap(ifm_ranges, prev_ofm_ranges)

    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    return arch.calc_block_dep(
        prev_ofm_rect,
        prev_ofm_block,
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
        intersects=intersects,
    )
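    # Interpretation (as used here): returning 0 fully serializes npu_op after
    # prev_op; MAX_BLOCKDEP, returned when the address ranges are disjoint,
    # permits maximum pipeline overlap, and arch.calc_block_dep picks an
    # intermediate value based on block-by-block intersection of the two ops.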
Tim Hall79d07d22020-04-27 18:20:16 +01001082
1083
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001084# -------------------------------------------------------------------
1085# PRINT
1086# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +02001087
1088
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001089def print_feature_map(fm: NpuFeatureMap, name: str):
1090 if fm is not None:
1091 q = (
1092 "no quantization"
1093 if fm.quantization is None
1094 else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
1095 )
1096 h, w, c = fm.shape
1097 sz = h * w * c * fm.data_type.size_in_bytes()
1098 print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
1099 strides = get_strides(fm)
1100 stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
1101 t = fm.tiles
1102 addresses = [hex(addr) for addr in t.addresses]
1103 print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +01001104
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001105
1106def print_operation(npu_op: NpuOperation, index: int = 0):
1107 pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
1108 if is_dma_op(npu_op):
1109 print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
1110 return
1111 k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
1112 if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
1113 print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
Patrik Gustavssoneca2e952020-05-27 09:15:11 +02001114 else:
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001115 if (
1116 npu_op.op_type == NpuOperationType.Conv2D
1117 and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
1118 ):
1119 fc = "FullyConnected "
1120 else:
1121 fc = ""
1122 print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
1123 print_feature_map(npu_op.ifm, "IFM")
1124 if npu_op.ifm2_scalar is not None:
1125 quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
1126 print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
1127 else:
1128 print_feature_map(npu_op.ifm2, "IFM2")
1129 print_feature_map(npu_op.ofm, "OFM")
1130 if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
1131 print(f" Kernel: {k}")
1132 if npu_op.padding is not None:
1133 print(f" {npu_op.padding}")
1134 for weights in npu_op.weights:
1135 print(f" Weights: {weights}")
1136 for bias in npu_op.biases:
1137 print(f" Scales: {bias}")
1138 if npu_op.activation is not None:
1139 act = npu_op.activation
1140 if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
1141 lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
1142 print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
1143 if npu_op.op_type == NpuOperationType.Conv2D:
1144 print(f" {npu_op.block_traversal}")
1145 bh, bw, bc = npu_op.block_config
1146 rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
1147 print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +01001148
Tim Hall79d07d22020-04-27 18:20:16 +01001149
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001150def print_operations(npu_op_list: List[NpuOperation]):
1151 for index, npu_op in enumerate(npu_op_list):
1152 print_operation(npu_op, index)
Tim Hall79d07d22020-04-27 18:20:16 +01001153
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001154
1155# -------------------------------------------------------------------
1156# OPERATIONS
1157# -------------------------------------------------------------------
1158
1159
1160def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
1161 """Generates NPU_OP_* command"""
1162 op_type = npu_op.op_type
1163 if op_type == NpuOperationType.Dma:
1164 emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
1165 elif op_type == NpuOperationType.Conv2D:
1166 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
1167 elif op_type == NpuOperationType.ConvDepthWise:
1168 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
1169 elif op_type == NpuOperationType.Pooling:
1170 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
1171 elif op_type == NpuOperationType.ElementWise:
1172 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
1173 else:
1174 assert 0, "Unsupported operation"
1175
1176
Louis Verhaard933f55e2020-11-25 14:10:30 +01001177def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001178 """Generates register commands for Conv2D operations"""
1179 generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001180
1181
1182def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
1183 """Generates register commands for depthwise convolution operations"""
1184 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +01001185
1186
1187def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
1188 """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
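            # Illustrative example (values assumed): with ifm2 quantisation
            # scale_f32=0.5 and zero_point=0, a scalar of 1.0 quantises to 2,
            # which must lie within the ifm2 data type's representable range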
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Conv2D:
        generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
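    # On multi-core Ethos-U65 systems, PARALLEL_MODE tells the NPU how many
    # cores to spread the work across (ncores - 1, i.e. presumably 0 for
    # single-core and 1 for dual-core operation)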
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
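            # BLOCKDEP expresses how far this operation may run ahead of the
            # previous one at block granularity; 0 serialises the two operations
            # completely, while larger values allow more overlap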
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates the command stream for the subgraph and adds it to sg.register_command_stream"""
    # Convert the high level command stream to a list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero; can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """Internal implementation of the public facing API for finding block configs"""
    if is_dma_op(npu_op):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    blocks = find_suitable_block_configs(arch, shared_buffer)
    return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
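# Illustrative usage (operation list assumed to be fully populated):
#
#   cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#   # cmds is a list of 32-bit words forming the Ethos-U command stream;
#   # external callers normally reach this via the public facing API in api.py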