# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations, generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

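# Illustrative note (not part of the generator): RegisterMachine acts as a
# write-back cache over the register file, so re-emitting an unchanged value
# produces no command. Assuming the class above:
#
#   rm = RegisterMachine()
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # True: value changed, emit
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 1)  # False: redundant, skip
#   rm.set_register(cmd0.NPU_SET_IFM_REGION, 2)  # True: changed again, emit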

class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


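# Illustrative note on the command word layout used by the emitter below:
# bits [9:0] hold the cmd0/cmd1 opcode, bit 14 (Payload32) selects whether a
# second 32-bit payload word follows, and bits [31:16] carry the 16-bit
# parameter. A hypothetical decode of one word would look like:
#
#   payload_mode = CmdMode(word & CmdMode.Mask)  # NoPayload or Payload32
#   opcode = word & CmdMode.CmdOpMask            # register/operation id
#   param = (word >> 16) & 0xFFFF                # immediate parameter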
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "  0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF  # payload is a single 32-bit word
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)

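# Worked example (illustrative, assuming quantise_float32 computes
# zero_point + round_away_zero(value / scale)): with scale_f32=0.5 and
# zero_point=10, quantise(1.0, quant) -> 10 + round(1.0 / 0.5) = 12; with
# quant=None the value is simply rounded.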

def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


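# Worked example (illustrative) of the stride/dilation bit packing above, for
# a 3x3 kernel, stride 2x2, dilation 1x1, depth-first block traversal:
#   KERNEL_HEIGHT_M1 = 1 * (3 - 1) = 2, KERNEL_WIDTH_M1 = 2
#   stride x low bit = (2-1) & 1 = 1; stride y low bit -> 1 << 1 = 2
#   extension and dilation bits are all 0, part-kernel-first bit not set
#   => NPU_SET_KERNEL_STRIDE = 0b11 = 3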

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    op_type = npu_op.op_type
    block_type = NpuBlockType.Default
    if op_type == NpuOperationType.Conv2D:
        block_type = NpuBlockType.ConvolutionMxN
    elif op_type == NpuOperationType.ConvDepthWise:
        block_type = NpuBlockType.ConvolutionDepthWise
    elif op_type == NpuOperationType.Pooling:
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif op_type == NpuOperationType.ElementWise:
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if npu_op.op_type == NpuOperationType.ElementWise:
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, the scale needs to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

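# Illustrative note: NPU_SET_OFM_SCALE carries a (scale, shift) pair, where
# the 32-bit payload is the multiplier and the 16-bit parameter the right
# shift, i.e. the float rescale factor is approximated as scale * 2^-shift
# (assuming that is how the helpers in scaling.py normalise their results).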

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)

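# Worked example (illustrative): an int8 feature map with h=4, w=8, depth=16
# in NHWC gives stride_c=1, stride_x=16, stride_y=128 bytes. The same shape
# in NHCWB16 (16-channel bricks) gives stride_x=16, stride_c=16*8=128 and
# stride_y = 1 * 8 * round_up(16, 16) = 128 bytes.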

def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]

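# Illustrative note on the tile box used above: a feature map is split into
# up to four tiles, selected in get_address() as
#   t0: x <  width_0, y <  height_0      t1: x >= width_0, y <  height_1
#   t2: x <  width_0, y >= height_0      t3: x >= width_0, y >= height_1
# get_address_ranges() returns one NpuAddressRange per tile (None for unused
# tiles); these ranges feed the memory-overlap checks further down.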
# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


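# Illustrative note: the returned 'outstanding' Watermark is what ends up as
# the parameter of NPU_OP_KERNEL_WAIT/NPU_OP_DMA_WAIT, i.e. "wait until at
# most N kernel/DMA operations are still in flight"; -1 means the dependency
# is already guaranteed and no wait command needs to be emitted.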
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


def calc_blockdep(arch: ArchitectureFeatures, prev_op: Optional[NpuBlockOperation], npu_op: NpuBlockOperation,) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_block_config = prev_op.block_config
    block_config = npu_op.block_config
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"        {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Conv2D:
        generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    """
    if is_dma_op(npu_op):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    blocks = find_suitable_block_configs(arch, shared_buffer)
    return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
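

# Minimal usage sketch (illustrative, not part of this module): the public
# entry point above can be driven with a list of NpuOperation objects built
# via the external API; the accelerator name below is an assumption, see the
# NpuAccelerator enum in api.py for the available variants.
#
#   from ethosu.vela.api import NpuAccelerator
#   cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#   # 'cmds' is a flat list of 32-bit command words, ending with NPU_OP_STOP.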