blob: 7de3d9ac6be47c824be4f21228a4705110a0ff82 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
Diego Russoe8a10452020-04-21 17:39:10 +010061from .ethos_u55_regs.ethos_u55_regs import acc_format
62from .ethos_u55_regs.ethos_u55_regs import activation
63from .ethos_u55_regs.ethos_u55_regs import cmd0
64from .ethos_u55_regs.ethos_u55_regs import cmd1
65from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020066from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020067from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010068from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010069from .numeric_util import quantise_float32
70from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010072from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010073from .range_set import MemoryAccessSet
Louis Verhaard1e170182020-11-26 11:42:04 +010074from .register_command_stream_util import calc_blockdep
75from .register_command_stream_util import get_dma_memory_accesses
76from .register_command_stream_util import get_op_memory_accesses
77from .register_command_stream_util import get_strides
78from .register_command_stream_util import get_wait_dependency
79from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010080from .register_command_stream_util import to_kernel
81from .register_command_stream_util import UNARY_ELEMWISE_OPS
82from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010083from .shared_buffer_allocation import find_suitable_block_configs
84from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
85from .shared_buffer_allocation import SharedBufferAllocation
Tim Hall79d07d22020-04-27 18:20:16 +010086
87
class RegisterMachine:
    """Tracks the last value written to each register, per bank, so that
    redundant register writes can be elided from the command stream."""

    def __init__(self):
        # Currently a single bank; kept as a list so more banks can be added.
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records a register write; returns True if the value differs from
        the last value written to this register in the active bank."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
102
103
class CmdMode(IntEnum):
    """Bit fields of the 16-bit command code word."""

    NoPayload = 0x0000  # cmd0: single command word, no payload
    Payload32 = 0x4000  # cmd1: command word followed by a 32-bit payload word
    Mask = 0xC000  # mask selecting the payload-mode bits
    CmdOpMask = 0x03FF  # mask selecting the command opcode bits
109
110
class CommandStreamEmitter:
    """Accumulates the command stream as a list of tuples of 32-bit words.

    Redundant register writes are filtered out via two RegisterMachine
    instances (one tracking DMA commands, one tracking all others).
    """

    WORD_SIZE = 4  # bytes per command stream word

    def __init__(self):
        self.cmd_stream = []  # list of tuples; each tuple holds one command's words
        self.reg_machine = [RegisterMachine(), RegisterMachine()]  # [non-DMA, DMA]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # current end-of-stream offset in bytes

    def get_reg_machine(self, cmd):
        """Returns the register machine tracking this command; DMA commands
        are tracked separately from all other commands."""
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size of the emitted command stream in bytes."""
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        """Flattens the command stream into a single list of 32-bit words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a human-readable dump of the emitted command stream."""
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a cmd0 command word; the parameter is packed into the upper
        16 bits. Skipped if the register already holds the same value."""
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a cmd1 command word followed by a 32-bit payload word.
        Skipped if the register already holds the same value."""
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; never elided, since waits are control flow
        rather than register state."""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits a kernel/DMA start command and switches register bank, so
        register writes for the next operation are tracked independently."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
199
200
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100201# -------------------------------------------------------------------
202# REGISTER GENERATION
203# -------------------------------------------------------------------
204
205
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit fields of the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # swap IFM and IFM2 operand order
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
213
214
# Maps NpuPoolingOp to the pooling_mode hardware register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the elementwise_mode hardware register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the activation hardware register value
# (TABLE_LOOKUP is handled separately in generate_activation)
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps the public NpuResamplingMode to the hardware resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
262
263
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value using the given quantization parameters.

    A missing quantization (or missing scale) falls back to scale 1 and
    zero point 0, i.e. plain rounding.
    """
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
269
270
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates the four IFM_PAD registers from the given padding."""
    pad_regs = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for reg, value in pad_regs:
        emit.cmd0_with_param(reg, value)
277
278
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No fused activation is encoded as NONE_OR_RELU with no min/max clipping
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # Missing min/max default to the full range of the OFM data type
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    # Clamp to the signed 16-bit range supported by the ACTIVATION_MIN/MAX registers,
    # and to the OFM data type's own range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        # LUT activations are encoded as register values 16..23
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
305
306
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates the four xFM_BASE (tile base address) registers."""
    if layout == NpuLayout.NHCWB16:
        # Brick format requires all tile base addresses to be 16-byte aligned
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for ptr_cmd, addr in zip(ptr_cmds, addresses):
        emit.cmd1_with_offset(ptr_cmd, addr)
316
317
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers (value minus one)."""
    for tile_cmd, size in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, size - 1)
323
324
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the feature map's byte strides."""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)
333
334
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates the IFM/IFM2_PRECISION register.

    Bit 0: signedness, bits 3:2: activation precision (8/16/32 bit),
    bit 6: brick (NHCWB16) format, bits 9:8: operand-to-scale selector.
    """
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 2
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
347
348
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates the OFM_PRECISION register.

    Bit 0: signedness, bits 2:1: activation precision, bit 6: brick
    (NHCWB16) format, bit 8: global scale, bits 15:14: rounding mode.
    """
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 1
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
363
364
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations."""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant scalar rather than a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        # Broadcast every IFM2 dimension of size 1 whose IFM counterpart differs
        shape = npu_op.ifm.shape
        shape2 = npu_op.ifm2.shape
        if shape.height != shape2.height:
            assert shape2.height == 1
            broadcast |= IFM2Broadcast.BroadcastHdim
        if shape.width != shape2.width:
            assert shape2.width == 1
            broadcast |= IFM2Broadcast.BroadcastWdim
        if shape.depth != shape2.depth:
            assert shape2.depth == 1
            broadcast |= IFM2Broadcast.BroadcastCdim
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
392
393
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, tile addresses/sizes, depth,
    strides and zero point."""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
409
410
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers; region/addresses/tiles are skipped
    when IFM2 is a scalar constant rather than a feature map."""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
426
427
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, tile addresses/sizes, full
    shape, strides and zero point."""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape; registers hold the value minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
445
446
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers.

    Kernel size registers hold the dilated size minus one; the stride
    register packs stride low bits, stride extension bits, dilation and
    the block traversal order into one word.
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Kernel stride low bits (x in bit 0, y in bit 1)
    stride = (stride_x_m1 & 1) | ((stride_y_m1 & 1) << 1)
    # Kernel stride extension bits (x in bits 6+, y in bits 9+)
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    # Dilation minus one (x in bit 3, y in bit 4)
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
464
465
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT region/base/length registers for each core."""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_offset(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core present but has no weights: point at core 0's data with zero length
            emit.cmd1_with_offset(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
484
485
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE (bias/scale) region/base/length registers for each core."""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_offset(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core present but has no biases: point at core 0's data with zero length
            emit.cmd1_with_offset(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
501
502
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers (value minus one)."""
    blk = npu_op.block_config
    assert blk is not None, "block_config has not been set"
    # Verify the chosen block configuration fits the shared buffer allocation
    alloc = shared_buffer.try_block(Block(blk.width, blk.height, blk.depth))
    assert alloc is not None, f"Block config {blk} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, blk.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, blk.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, blk.depth - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100517
518
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations."""
    # Elementwise ops claim the total size of available SHRAM as input buffer
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
540
541
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations."""
    # Input buffer ends where the IFM's allocated banks end
    ifm_ib_end = (
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM]
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
550
551
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation.

    Maps the operation's concrete type to its NpuBlockType and delegates to
    shared_buffer_allocation_for_npu_op.

    Raises ValueError for unsupported operation types. A raise is used instead
    of the previous "assert 0": asserts are stripped under "python -O", which
    would have left block_type unbound and failed later with a confusing
    UnboundLocalError.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        # REDUCE_SUM is a pooling sub-op but uses its own block type
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        raise ValueError(f"Unsupported operation: {type(npu_op).__name__}")
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)
566
567
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT commands; a negative count means no
    wait is required on that channel."""
    waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in waits:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
575
576
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # IFM: addresses/shape/strides, precision and upscaling mode
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # OFM: addresses/shape/strides and precision
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise ops have no kernel; all other block ops do
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block config and SHRAM layout depend on the shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100606
607
608# -------------------------------------------------------------------
609# SCALING
610# -------------------------------------------------------------------
611
612
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Fused sigmoid/tanh: output range is fixed, scale relative to 0x3000
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double rescale until it would overflow half the int16 range (max 30 shifts)
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # Number of bits needed to represent the rescale factor (+1 headroom)
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # Missing quantization: identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
665
666
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        # Fused sigmoid/tanh overrides the output scale to the fixed 1/0x3000 range
        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Any missing scale: fall back to identity scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Any missing scale: identity scaling, unless an explicit rescale is given
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining unary ops (e.g. CLZ/SHR/SHL/MIN/MAX) use identity scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
725
726
# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200730
731
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints shape, quantization, stride and tile info for a feature map; no-op when fm is None."""
    if fm is None:
        return
    if fm.quantization is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    size_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout},"
        f" {fm.data_type}, size={size_bytes}, {quant_str}"
    )
    strides = get_strides(fm)
    tiles = fm.tiles
    tile_bases = [hex(addr) for addr in tiles.addresses]
    print(
        f" Stride y/x/c: {strides.height}/{strides.width}/{strides.depth},"
        f" tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, base={tile_bases}"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100747
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100748
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human-readable summary of a single NPU operation.

    :param npu_op: the operation to print
    :param index: position of the operation in the command list, used as line prefix
    :param cmd: optional originating high-level command, appended to the header line
    """
    pass_info = f", {cmd}" if cmd else ""
    # Operations that are neither DMA nor block operations get a one-line summary only
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 convolution with unit stride/dilation is reported as fully connected.
        # Fix: guard k against None here, like the "Kernel:" print below already does,
        # so a Conv2D op without a kernel cannot crash the debug printer.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # NONE_OR_RELU with no clamp bounds is the default; only print anything else
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100796
Tim Hall79d07d22020-04-27 18:20:16 +0100797
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints a summary of every NPU operation in the list, prefixed by its index."""
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for idx, op in enumerate(npu_op_list):
        print_operation(op, idx, npu_op_to_cmd.get(op))
Tim Hall79d07d22020-04-27 18:20:16 +0100802
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100803
# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------
807
808
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that kicks off execution of the given operation."""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA channel and mode are packed into the single parameter field
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"
823
824
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the register settings for a 2D convolution; block traversal is taken from the op."""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100828
829
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the register settings for a depthwise convolution (always depth-first traversal)."""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100835
836
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Emits the register settings for a pooling operation."""
    # Global OFM scaling is only used for unpadded average/reduce-sum pooling
    is_avg_or_sum = npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = is_avg_or_sum and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling-specific: global OFM scale register
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100846
847
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the register settings for an elementwise operation, including IFM2 for binary ops."""
    # These sub-ops use the global OFM scale register
    global_scale_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in global_scale_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation: also set up the IFM2 registers
    assert npu_op.ifm2 is not None
    scalar_input = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, scalar_input)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if scalar_input:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        # The quantized scalar must fit in IFM2's data type
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873
874
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits the source/destination/length registers for a DMA transfer."""
    src = dma_op.src
    dest = dma_op.dest
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dest.address)
    # Transfer length comes from the source address range
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, src.length)
883
884
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Emits all register commands for the given operation, except the final
    NPU_OP_* command (emitted separately by generate_operation_code).
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100902
903
def generate_command_stream(
    npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None, npu_op_to_cmd=None
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: operations to encode, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated command stream
    :param add_to_debug_db: optional callback(npu_op, offset) invoked with the command
        stream offset after each operation's NPU_OP command is emitted
    :param npu_op_to_cmd: optional mapping from NPU operation to the originating
        high-level command; used for verbose printing only
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation; needed for wait dependencies below
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"
    if arch.is_ethos_u65_system:
        # Ethos-U65: configure how many cores run in parallel (register takes ncores-1)
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None  # previous block operation, input to BLOCKDEP calculation
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        generate_registers_for_op(emit, npu_op, arch)
        # NOTE(review): the "not NpuDmaOperation" check looks redundant if NpuDmaOperation
        # is not a subclass of NpuBlockOperation — confirm against the api module
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP, capped at the architecture's maximum
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
951
952
# -------------------------------------------------------------------
# EXTERNAL API
# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100956
957
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    Returns an empty list for operations that are not block operations (e.g. DMA).
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    configs = []
    for block in find_suitable_block_configs(arch, shared_buffer):
        # depth deliberately comes from index 3; index 2 is unused here
        configs.append(NpuShape3D(height=block[0], width=block[1], depth=block[3]))
    return configs
Louis Verhaard933f55e2020-11-25 14:10:30 +0100968
969
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    return generate_command_stream(npu_op_list, arch, verbose=False)