blob: a4466c921771d09665f32a449e644164b5d41c14 [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Tim Hall79d07d22020-04-27 18:20:16 +010020from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010021from enum import Enum
22from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010023from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010024from typing import List
25from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010026
27import numpy as np
28
29from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010030from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010031from .api import NpuActivation
32from .api import NpuActivationOp
33from .api import NpuAddressRange
34from .api import NpuBlockOperation
35from .api import NpuBlockTraversal
36from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010037from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .api import NpuDataType
39from .api import NpuDmaOperation
40from .api import NpuElementWiseOp
41from .api import NpuElementWiseOperation
42from .api import NpuFeatureMap
43from .api import NpuKernel
44from .api import NpuLayout
45from .api import NpuOperation
46from .api import NpuOperationType
47from .api import NpuPadding
48from .api import NpuPoolingOp
49from .api import NpuPoolingOperation
50from .api import NpuQuantization
51from .api import NpuResamplingMode
52from .api import NpuRoundingMode
53from .api import NpuShape3D
54from .api import NpuTileBox
55from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010056from .architecture_features import ArchitectureFeatures
57from .architecture_features import Block
Louis Verhaard52078302020-11-18 13:35:06 +010058from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import SharedBufferArea
60from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010061from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010062from .ethos_u55_regs.ethos_u55_regs import acc_format
63from .ethos_u55_regs.ethos_u55_regs import activation
64from .ethos_u55_regs.ethos_u55_regs import cmd0
65from .ethos_u55_regs.ethos_u55_regs import cmd1
66from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020067from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020068from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010069from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010070from .numeric_util import quantise_float32
71from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010072from .numeric_util import round_up_to_int
Tim Hall79d07d22020-04-27 18:20:16 +010073from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010074from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010075from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010076from .register_command_stream_util import calc_blockdep
77from .register_command_stream_util import get_dma_memory_accesses
78from .register_command_stream_util import get_op_memory_accesses
79from .register_command_stream_util import get_strides
80from .register_command_stream_util import get_wait_dependency
81from .register_command_stream_util import has_ifm2
Louis Verhaard1e170182020-11-26 11:42:04 +010082from .register_command_stream_util import to_kernel
83from .register_command_stream_util import UNARY_ELEMWISE_OPS
84from .register_command_stream_util import Watermark
Louis Verhaarde8a5a782020-11-02 18:04:27 +010085from .shared_buffer_allocation import find_suitable_block_configs
86from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
87from .shared_buffer_allocation import SharedBufferAllocation
Louis Verhaard024c3552021-03-17 14:26:34 +010088from ethosu.vela.errors import VelaError
Tim Hall79d07d22020-04-27 18:20:16 +010089
90
class RegisterMachine:
    """Remembers the last value written to each register so redundant writes can be elided."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Records a register write; returns True if the value differs from the last write."""
        bank = self.registers[self.bank_idx]
        has_changed = bank[reg] != value
        bank[reg] = value
        return has_changed

    def switch_bank(self):
        """Advances to the next register bank (wraps around)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
105
106
class CmdMode(IntEnum):
    """Payload-mode bits encoded in a command word's code field."""

    NoPayload = 0x0000  # cmd0: single command word, no payload
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # mask to extract the payload mode from a command code
    CmdOpMask = 0x03FF  # mask to extract the command opcode from a command code
112
113
class CommandStreamEmitter:
    """Builds up a register command stream as a list of 32-bit words, eliding redundant writes."""

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []  # list of tuples; each tuple holds the word(s) of one command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]  # [0]: NPU commands, [1]: DMA commands
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset at which the next command will be emitted

    def get_reg_machine(self, cmd):
        # DMA registers are tracked separately from the other registers so that
        # interleaved DMA/NPU commands do not defeat redundant-write elimination
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        """Returns the total size of the emitted command stream, in bytes."""
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        """Flattens the command stream into a list of 32-bit words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints the emitted commands in human readable form, for debugging."""
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word cmd0 command with a 16-bit parameter, unless it is redundant."""
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word cmd1 command (command word + 32-bit payload), unless it is redundant."""
        # Bug fix: the payload is one 32-bit word, so mask with 0xFFFFFFFF. The previous
        # mask 0xFFFFFFFFF kept 36 bits, which could leak address bits >= 32 into the
        # payload; those bits belong in the param field (see cmd1_with_address).
        offset = int(offset) & 0xFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a cmd1 command for an address; bits 32 and up go into the param field."""
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command for the given channel/outstanding count (no redundancy elision)."""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command and switches register bank for the next operation."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
205
206
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100207# -------------------------------------------------------------------
208# REGISTER GENERATION
209# -------------------------------------------------------------------
210
211
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100212# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags written to the IFM2_BROADCAST register."""

    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # swap IFM and IFM2 operand order
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant rather than a feature map
219
220
# Maps a pooling op to the corresponding pooling_mode value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an elementwise op to the corresponding elementwise_mode value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an activation op to the corresponding activation function value
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an NpuResamplingMode to the corresponding resampling_mode value
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
268
269
def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    :param memory_accesses: set of memory accesses performed by the operation
    :param mem_limits: maps region index to the highest allowed address offset in that region
    :raises VelaError: if an access uses an unknown region or falls outside [0, limit]
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # renamed from "max" to avoid shadowing the builtin
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                # both endpoints of every range must lie within the region
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")
283
284
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value using the given quantization parameters (scale 1 / zero point 0 if absent)"""
    if quant is None or quant.scale_f32 is None:
        scale = 1
    else:
        scale = quant.scale_f32
    zero_point = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zero_point)
290
291
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_cmds = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, pad_value in pad_cmds:
        emit.cmd0_with_param(pad_cmd, pad_value)
298
299
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    dtype_min = ofm.data_type.min_value()
    dtype_max = ofm.data_type.max_value()
    quantized_min = dtype_min if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = dtype_max if act.max is None else quantise(act.max, ofm.quantization)
    # Clamp to both the int16 range and the OFM data type's own range
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, dtype_min)
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, dtype_max)
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
326
327
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # NHCWB16 requires every tile base address to be 16-byte aligned
        assert all(int(addr) % 16 == 0 for addr in addresses)
    for index in range(4):
        emit.cmd1_with_address(ptr_cmds[index], addresses[index])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold the extent minus one
    for tile_cmd, extent in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, extent - 1)
342
343
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    stride_cmds = (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride_value in stride_cmds:
        emit.cmd1_with_address(stride_cmd, stride_value)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100352
353
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # bit 0: signedness, bits 2..: precision derived from the data type size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6  # brick (NHCWB16) format flag

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
366
367
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    ofm = npu_op.ofm
    dtype = ofm.data_type
    # bit 0: signedness, bits 1..: precision derived from the data type size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)

    if use_global_scale:
        prec |= 1 << 8  # use the global OFM scale, as opposed to per channel scale
    if ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6  # brick (NHCWB16) format flag
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
382
383
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    broadcast = 0
    if npu_op.reversed_operands:
        broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant: set UseIFM2Scalar bit instead of reading a feature map
        broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        shape = npu_op.ifm.shape
        shape2 = npu_op.ifm2.shape
        dim_flags = (
            (shape.height, shape2.height, IFM2Broadcast.BroadcastHdim),
            (shape.width, shape2.width, IFM2Broadcast.BroadcastWdim),
            (shape.depth, shape2.depth, IFM2Broadcast.BroadcastCdim),
        )
        for ifm_dim, ifm2_dim, flag in dim_flags:
            if ifm_dim != ifm2_dim:
                # Broadcasting is only possible when the IFM2 dimension is 1
                assert ifm2_dim == 1
                broadcast |= flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, broadcast)
411
412
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
428
429
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, base addresses and tile geometry only apply when IFM2 is a feature map
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
445
446
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)
    # Full OFM shape, each register holds the extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
464
465
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # stride register bit layout:
    #   bit 0: x stride low bit, bit 1: y stride low bit, bit 2: part-kernel-first traversal,
    #   bits 3/4: x/y dilation, bits 6..: x stride extension, bits 9..: y stride extension
    stride = stride_x_m1 & 1
    stride |= (stride_y_m1 & 1) << 1
    stride |= (stride_x_m1 >> 1) << 6
    stride |= (stride_y_m1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
483
484
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(weights):
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core has no weights of its own: point it at the first source with zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
503
504
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set bias/scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < len(biases):
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core has no biases of its own: point it at the first source with zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)
520
521
def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    assert block_config is not None, "block_config has not been set"
    # Verify the chosen block fits in the shared buffer allocation before programming it
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    blk_cmds = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    for blk_cmd, extent in blk_cmds:
        emit.cmd0_with_param(blk_cmd, extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100536
537
def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        ifm2_ib_start = (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, ifm2_ib_start)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
559
560
def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    # IFM input buffer ends after the banks allocated for the IFM
    ifm_ib_end = (
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM]
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, ifm_ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])
569
570
def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:
    """Creates shared buffer allocation for the given operation"""
    # Map the operation's concrete type to the corresponding block type
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        is_reduce_sum = npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM
        block_type = NpuBlockType.ReduceSum if is_reduce_sum else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, resampling_mode_map[npu_op.ifm_upscale])
585
586
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT; a negative outstanding count means no wait is emitted"""
    wait_cmds = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding_count in wait_cmds:
        if outstanding_count >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding_count)
594
595
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations

    Emits IFM/OFM geometry and precision, padding, kernel, weight/bias sources,
    activation, block config and SHRAM layout registers for the given operation.
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations have no kernel; all other block operations require one
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block config and SHRAM registers are both derived from one shared buffer allocation
    shared_buffer = create_shared_buffer(npu_op, arch)
    generate_block_config(emit, npu_op, arch, shared_buffer)
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    else:
        generate_shram_registers_non_elementwise(emit, shared_buffer)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100625
626
627# -------------------------------------------------------------------
628# SCALING
629# -------------------------------------------------------------------
630
631
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations

    Chooses scale and shift depending on how the pooling op is fused:
    sigmoid/tanh activation, fused quantize, explicit rescale, or plain pooling.
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        # Sigmoid/tanh use a fixed output scale based on 0x3000 (see the INT16 comment below)
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            # Double the scale until it approaches the int16 limit (or the shift limit of 30)
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            # Number of bits needed to represent the rescale factor, plus one
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization info available: identity scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
684
685
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    # 0 means "no operand needs extra scaling"; only the advanced add/sub
    # path below selects a specific operand (scaling.OperandToScale)
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Any of these may be None if the corresponding tensor has no quantization
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh requires a fixed output scale
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                # Missing quantization info; fall back to identity scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                # Identity scaling, unless the operation carries an explicit rescale
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LRELU/ABS scale purely by the output quantization
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # Remaining elementwise ops (e.g. MIN/MAX/SHL/SHR) use identity scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
744
745
746# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100747# PRINT
748# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200749
750
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints shape, quantization, stride and tile info for one feature map; no-op when fm is None."""
    if fm is None:
        return
    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = [hex(addr) for addr in t.addresses]
    print(f"         {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100766
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100767
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a single NPU operation (optionally with its originating command) for debugging."""
    suffix = f", {cmd}" if cmd else ""
    # Simple operations (neither DMA nor block operations) print one line only
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{suffix}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{suffix}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{suffix}")
    else:
        # A 1x1 convolution with unit stride/dilation is flagged as fully connected
        fc = ""
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        print(f"{index} {fc}{npu_op.op_type.name}{suffix}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    act = npu_op.activation
    if act is not None and (act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None):
        lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
        print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100815
Tim Hall79d07d22020-04-27 18:20:16 +0100816
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, looking up its originating command if a mapping is given."""
    cmd_lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, cmd_lookup.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100821
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100822
823# -------------------------------------------------------------------
824# OPERATIONS
825# -------------------------------------------------------------------
826
827
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates the final NPU_OP_* command that starts the given operation"""
    if isinstance(npu_op, NpuDmaOperation):
        # DMA start encodes channel and mode in the parameter field
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"
842
843
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Generates register commands for Conv2D operations"""
    # Conv2D carries its own block traversal order (part-kernel/depth first)
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100847
848
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Generates register commands for depthwise convolution operations"""
    # Depthwise convolutions always use depth-first block traversal
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100854
855
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    # Global OFM scaling applies only to average/reduce-sum pooling without padding
    globally_scaled_ops = (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM)
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops and sum(npu_op.padding) == 0
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100865
866
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    globally_scaled = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled
    # Scaling registers must be emitted before the common registers
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        # The quantized scalar must fit the IFM2 data type's value range
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100892
893
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    src = dma_op.src
    dest = dma_op.dest
    # Emit order is kept: source region/address, destination region/address, then length
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dest.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100902
903
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    if isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
        return
    # Block operations: dispatch on the concrete operation class
    for op_class, generator in (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
    ):
        if isinstance(npu_op, op_class):
            generator(emit, npu_op, arch)
            return
    assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100921
922
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: high level NPU operations, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the emitted commands
    :param mem_limits: maximum address offset per memory region, used for validation
    :param add_to_debug_db: optional callback(npu_op, offset) recording the stream
        offset of each operation
    :param npu_op_to_cmd: optional mapping from NPU operation to originating
        high-level command, used only for verbose printing
    :raises VelaError: on memory-limit violations or if the stream exceeds 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Ethos-U65: configure the number of parallel cores
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None  # previous block operation, used for BLOCKDEP calculation
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            # Advance the dependency watermark and compute which earlier
            # commands this operation must wait for
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # Wait commands must precede the NPU_OP command they guard
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # Hardware limitation: the command stream size field is 24 bits
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
988
989
990# -------------------------------------------------------------------
991# EXTERNAL API
992# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100993
994
def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:
    """
    Internal implementation of the public facing API for finding block configs.
    Returns an empty list for operations that are not block operations.
    """
    if not isinstance(npu_op, NpuBlockOperation):
        return []
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    shared_buffer = create_shared_buffer(npu_op, arch)
    return [
        NpuShape3D(height=block[0], width=block[1], depth=block[3])
        for block in find_suitable_block_configs(arch, shared_buffer)
    ]
Louis Verhaard933f55e2020-11-25 14:10:30 +01001005
1006
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))
    # All 8 regions are limited by the architecture's maximum address offset,
    # except the mem2mem base pointer which is limited by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)