Blame - ethosu/vela/high_level_command_to_npu_op.py - ml/ethos-u/ethos-u-vela

blob: c822132031c82342368888a3a6e425fa22d3a20a [file] [log] [blame]

erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Conversion from high level command to NpuOperation
				19	from enum import IntEnum
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	20	from typing import cast
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	21	from typing import Dict
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	22	from typing import List
				23	from typing import Optional
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	24	from typing import Tuple
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	25
				26	from .api import NpuActivation
				27	from .api import NpuActivationOp
				28	from .api import NpuAddressRange
				29	from .api import NpuBlockOperation
				30	from .api import NpuBlockTraversal
				31	from .api import NpuConv2DOperation
				32	from .api import NpuConvDepthWiseOperation
				33	from .api import NpuDataType
				34	from .api import NpuDmaOperation
				35	from .api import NpuElementWiseOp
				36	from .api import NpuElementWiseOperation
				37	from .api import NpuFeatureMap
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	38	from .api import NpuLayout
				39	from .api import NpuOperation
				40	from .api import NpuPadding
				41	from .api import NpuPoolingOp
				42	from .api import NpuPoolingOperation
				43	from .api import NpuQuantization
				44	from .api import NpuResamplingMode
				45	from .api import NpuRoundingMode
				46	from .api import NpuShape3D
				47	from .api import NpuTileBox
				48	from .architecture_features import ArchitectureFeatures
				49	from .data_type import DataType
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	50	from .debug_database import DebugDatabase
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	51	from .errors import UnsupportedFeatureError
Tim Hall	3c5cfe9	2022-03-16 16:31:57 +0000	[diff] [blame^]	52	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	53	from .high_level_command_stream import Box
				54	from .high_level_command_stream import Command
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	55	from .high_level_command_stream import DMA
				56	from .high_level_command_stream import NpuStripe
Fredrik Svedberg	838df0a	2021-09-17 16:29:22 +0200	[diff] [blame]	57	from .numeric_util import quantise_float32
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	58	from .numeric_util import round_up
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	59	from .operation import NpuBlockType
				60	from .operation import Op
				61	from .operation import Operation
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	62	from .register_command_stream_generator import generate_command_stream
				63	from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	64	from .register_command_stream_util import to_npu_kernel
				65	from .register_command_stream_util import UNARY_ELEMWISE_OPS
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame]	66	from .shape4d import Shape4D
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	67	from .tensor import MemType
				68	from .tensor import Tensor
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	69	from .tensor import TensorFormat
				70	from .tensor import TensorPurpose
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	71	from .tensor import TensorSubPurpose
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	72	from .weight_compressor import NpuWeightTensor
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	73	from .weight_compressor import WeightKey
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	74
				75
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	76	class BasePointerIndex(IntEnum):
				77	WeightTensor = 0 # base address index for the Weight tensor
				78	ScratchTensor = 1 # base address index for the Scratch_tensor in the TensorArena
				79	ScratchFastTensor = 2 # base address for the Scratch_fast_tensor
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	80
				81
				82	dtype_map = {
				83	DataType.uint8: NpuDataType.UINT8,
				84	DataType.int8: NpuDataType.INT8,
				85	DataType.uint16: NpuDataType.UINT16,
				86	DataType.int16: NpuDataType.INT16,
				87	DataType.int32: NpuDataType.INT32,
				88	}
				89
				90
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	91	# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
				92	elementwise_op_map = {
				93	Op.Mul: NpuElementWiseOp.MUL,
Patrik Gustavsson	b081d67	2021-08-25 13:49:25 +0200	[diff] [blame]	94	Op.RescaleMul: NpuElementWiseOp.MUL,
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	95	Op.Add: NpuElementWiseOp.ADD,
Fredrik Svedberg	e82be7c	2021-01-18 15:21:03 +0100	[diff] [blame]	96	Op.RescaleAdd: NpuElementWiseOp.ADD,
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	97	Op.Sub: NpuElementWiseOp.SUB,
				98	Op.Minimum: NpuElementWiseOp.MIN,
				99	Op.Maximum: NpuElementWiseOp.MAX,
				100	Op.LeakyRelu: NpuElementWiseOp.LRELU,
				101	Op.Abs: NpuElementWiseOp.ABS,
				102	Op.CLZ: NpuElementWiseOp.CLZ,
				103	Op.SHR: NpuElementWiseOp.SHR,
				104	Op.SHL: NpuElementWiseOp.SHL,
				105	}
				106
				107
Tim Hall	3c5cfe9	2022-03-16 16:31:57 +0000	[diff] [blame^]	108	# inverse of the resampling_mode_map in the register command stream generator
				109	resampling_mode_inv_map = {
				110	resampling_mode.NONE: NpuResamplingMode.NONE,
				111	resampling_mode.NEAREST: NpuResamplingMode.NEAREST,
				112	resampling_mode.TRANSPOSE: NpuResamplingMode.TRANSPOSE,
				113	}
				114
				115
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	116	def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
				117	if ifm_shape == []:
				118	# Scalar needs to be in IFM2
				119	return False
				120	if ifm2_shape == []:
				121	return True
				122
				123	for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
				124	if ifm != ifm2 and ifm == 1:
				125	# Broadcasted FM needs to be in IFM2
				126	return False
				127	return True
				128
				129
Patrik Gustavsson	b0ca274	2020-11-18 07:59:09 +0100	[diff] [blame]	130	def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	131	"""Specifies type of rounding to be used"""
				132	rounding_mode = NpuRoundingMode.TFL
				133	if op.type == Op.ResizeBilinear:
Dwight Lidman	9d24393	2021-08-10 12:53:12 +0200	[diff] [blame]	134	rounding_mode = NpuRoundingMode.NATURAL
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	135	elif (
				136	op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
				137	and op.ifm.dtype == DataType.int16
				138	):
				139	rounding_mode = NpuRoundingMode.NATURAL
Patrik Gustavsson	b0ca274	2020-11-18 07:59:09 +0100	[diff] [blame]	140	elif (
				141	not fused_quantize
				142	and op.type.is_avgpool_op()
				143	and op.memory_function == Op.ConcatSliceWrite
				144	and op.kernel.elements_wh() == 1
				145	):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	146	rounding_mode = NpuRoundingMode.NATURAL
Louis Verhaard	1a92f78	2021-02-09 16:08:26 +0100	[diff] [blame]	147	if op.rounding_mode is not None:
				148	rounding_mode = op.rounding_mode
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	149	return rounding_mode
				150
				151
				152	def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
				153	if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				154	return NpuPadding(top=0, left=0, bottom=0, right=0)
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	155	top, left, bottom, right = primary_op.attrs["explicit_padding"]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	156
				157	# Check if this is for horizontal ifm streaming
				158	if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	159	top = cmd.pad_top
				160	bottom = cmd.pad_bottom
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	161
Tim Hall	3751aa4	2021-12-16 13:17:29 +0000	[diff] [blame]	162	# the ifm box coordinate range depends upon whether the primary op was combined with a split slice read
				163	ifm_read_offset = primary_op.read_offsets[0]
				164	ifm_read_shape = primary_op.read_shapes[0]
				165	if ifm_read_offset is None or len(ifm_read_offset) < 2:
				166	box_start_coord_min = 0
				167	box_end_coord_max = cmd.ps.ifm_shapes[0].width
				168	else:
				169	box_start_coord_min = ifm_read_offset[-2]
				170	box_end_coord_max = ifm_read_shape[-2]
				171
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	172	# Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
				173	# because of activation function needed to be fused.
Tim Hall	3751aa4	2021-12-16 13:17:29 +0000	[diff] [blame]	174	if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > box_start_coord_min:
				175	left = 0
				176	if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < box_end_coord_max:
				177	right = 0
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	178	return NpuPadding(top=top, left=left, bottom=bottom, right=right)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	179
				180
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	181	def get_region(mem_type: MemType, arch: ArchitectureFeatures) -> int:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	182	base_ptr_idx_map = {
				183	MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
				184	MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
				185	MemType.Scratch: BasePointerIndex.ScratchTensor,
				186	}
				187
				188	if arch.is_spilling_enabled():
				189	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	190	else:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	191	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor
				192
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	193	return base_ptr_idx_map[mem_type].value
				194
				195
				196	def get_mem_limits_for_regions(arch: ArchitectureFeatures) -> Dict[int, int]:
				197	"""Returns map region -> max size of the region in bytes"""
				198	mem_limits = dict()
				199	for mem_type in MemType.all():
				200	mem_limits[get_region(mem_type, arch)] = arch.mem_type_size(mem_type)
				201	mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
				202	return mem_limits
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	203
				204
Louis Verhaard	e91b531	2022-01-21 13:38:50 +0100	[diff] [blame]	205	def get_double_buffer_offset(arch: ArchitectureFeatures, range_index: int, core: int) -> int:
				206	"""Returns 0 if the first half of a double buffer should be used, 1 if the second half should be used"""
				207	return ((range_index - core) // arch.ncores) % 2
				208
				209
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	210	def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
				211	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	212	block = ifm_box.get_block()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	213	else:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	214	block = ofm_box.get_block()
				215	return block.depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	216
				217
				218	def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
				219	"""Checks if quantization should use 0 as zero point"""
				220	if tens.dtype == DataType.int32 and is_ifm_tensor:
				221	return True
				222	if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
				223	return False
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	224	if ps.primary_op.type == Op.AvgPool and ps.primary_op.explicit_scaling:
				225	return False
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	226	fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
				227	forced_ofm_quantization = ps.primary_op.forced_output_quantization
				228	use_0 = (
Fredrik Svedberg	838df0a	2021-09-17 16:29:22 +0200	[diff] [blame]	229	(
				230	ps.primary_op.activation is None
				231	or forced_ofm_quantization is not None
Fredrik Svedberg	6f87be4	2021-10-07 10:54:20 +0200	[diff] [blame]	232	or (
				233	ps.primary_op.type.is_avgpool_op()
				234	and ps.primary_op.activation.op_type.is_relu_op()
				235	and not ps.primary_op.rescale
				236	)
Fredrik Svedberg	838df0a	2021-09-17 16:29:22 +0200	[diff] [blame]	237	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	238	and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
				239	and not fused_quantize
				240	)
				241	return use_0
				242
				243
				244	def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				245	"""Gets quantization for IFM/IFM2"""
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame]	246	op = ps.primary_op
				247	ifm_quant = op.forced_input_quantization if op.forced_input_quantization is not None else tens.quantization
				248	if ifm_quant is None:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	249	return None
				250	if use_zero_point_0(ps, tens, True):
				251	zero_point = 0
				252	else:
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame]	253	zero_point = int(ifm_quant.zero_point)
				254	return NpuQuantization(scale_f32=ifm_quant.scale_f32, zero_point=zero_point)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	255
				256
				257	def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				258	"""Gets quantization for OFM"""
				259	op = ps.primary_op
				260	# Check if operation's output quantization is should be used instead of the output tensor's quantization
				261	# (used in LUTs)
				262	ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
				263	if ofm_quant is None:
				264	return None
				265	if use_zero_point_0(ps, tens, False):
				266	zero_point = 0
				267	else:
				268	zero_point = int(ofm_quant.zero_point)
				269	return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
				270
				271
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	272	def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	273	"""Creates feature map with common fields populated"""
				274	fm = NpuFeatureMap()
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	275	fm.region = get_region(tens.mem_type, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	276	fm.data_type = dtype_map[tens.dtype]
				277	if tens.format == TensorFormat.NHWC:
				278	fm.layout = NpuLayout.NHWC
				279	elif tens.format == TensorFormat.NHCWB16:
				280	fm.layout = NpuLayout.NHCWB16
				281	else:
				282	assert 0, "Incorrect tensor format"
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	283	height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
				284	box.start_coord, box.end_coord, op_shape4D
				285	)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	286	for idx, addr in enumerate(addresses):
				287	if addr is None:
				288	addresses[idx] = 0
				289	fm.tiles = NpuTileBox(
				290	height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
				291	)
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	292	strides = tens.get_strides(shape4D=op_shape4D)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	293	fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
				294	return fm
				295
				296
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	297	def create_weights(
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	298	weight_tensor: NpuWeightTensor, weight_box: Box, scale_tensor: NpuWeightTensor, arch: ArchitectureFeatures
				299	) -> Tuple[List[NpuAddressRange], List[NpuAddressRange]]:
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	300	"""Returns address ranges for weights and scales"""
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	301	weights = []
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	302	biases = []
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	303	shared_region = get_region(weight_tensor.mem_type, arch)
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	304	scale_region = get_region(scale_tensor.mem_type, arch) if scale_tensor else 0
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	305
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	306	w_tensor_src = weight_tensor
				307	if weight_tensor.src_tensor:
Jonas Ohlsson	845e232	2022-03-01 12:39:55 +0100	[diff] [blame]	308	w_tensor_src = cast(NpuWeightTensor, weight_tensor.src_tensor)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	309
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	310	core_offset = 0
				311	for core in range(0, arch.ncores):
				312	# Get weight range per core
				313	key = WeightKey(core, weight_box.start_coord[-1])
				314	if key in w_tensor_src.encoded_ranges:
				315	weight_range = w_tensor_src.encoded_ranges[key]
				316	if weight_tensor.sub_purpose == TensorSubPurpose.DoubleBuffer:
				317	assert weight_tensor != w_tensor_src
				318	# Double buffered inside weight_tensor
Louis Verhaard	e91b531	2022-01-21 13:38:50 +0100	[diff] [blame]	319	address = weight_tensor.address + core_offset
				320	address += get_double_buffer_offset(arch, weight_range.index, core) * w_tensor_src.max_range_bytes
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	321	core_offset += round_up(weight_range.total_bytes, 16)
				322	else:
				323	if weight_tensor == w_tensor_src:
				324	# Straight from source tensor
				325	address = weight_tensor.address + weight_range.offset
				326	else:
				327	# Single buffered inside weight tensor
				328	address = weight_tensor.address + core_offset
				329	core_offset += round_up(weight_range.total_bytes, 16)
				330
				331	# Location of weights in tensor
				332	addr_range = NpuAddressRange(
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	333	shared_region, int(address + weight_range.weight_offset), round_up(int(weight_range.weight_bytes), 16)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	334	)
				335	weights.append(addr_range)
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	336
				337	# Location of standalone scales or combined weights tensor scales
				338	if scale_tensor:
				339	assert scale_tensor.src_tensor is None # Must be standalone
				340	scale_range = scale_tensor.encoded_ranges[key]
				341	address = scale_tensor.address + scale_range.offset
				342	addr_range = NpuAddressRange(scale_region, int(address), round_up(int(scale_range.scale_bytes), 16))
				343	else:
				344	addr_range = NpuAddressRange(shared_region, int(address), round_up(int(weight_range.scale_bytes), 16))
				345
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	346	biases.append(addr_range)
				347
				348	return weights, biases
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	349
				350
				351	def create_npu_activation(op: Operation) -> NpuActivation:
				352	"""Creates fused activation function"""
				353	if op.activation is None:
				354	return NpuActivation(NpuActivationOp.NONE_OR_RELU)
				355	faf = op.activation.op_type
				356	act_op = NpuActivationOp.NONE_OR_RELU
				357	if faf == Op.Tanh:
				358	act_op = NpuActivationOp.TANH
				359	elif faf == Op.Sigmoid:
				360	act_op = NpuActivationOp.SIGMOID
				361	elif faf == Op.LUT:
				362	act_op = NpuActivationOp.TABLE_LOOKUP
				363	elif not faf.is_relu_op():
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	364	raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	365
				366	act = NpuActivation(act_op)
				367	act.min = op.activation.min
				368	act.max = op.activation.max
Fredrik Svedberg	6f87be4	2021-10-07 10:54:20 +0200	[diff] [blame]	369	if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.rescale:
Fredrik Svedberg	838df0a	2021-09-17 16:29:22 +0200	[diff] [blame]	370	quant = op.ofm.quantization
				371	if quant and quant.zero_point: # Zero point is not 0
				372	scale_f32 = 1 if quant.scale_f32 is None else quant.scale_f32
				373	zero_point = quant.zero_point
				374	if act.min is not None:
				375	act.min = scale_f32 * quantise_float32(act.min, scale_f32, zero_point)
				376	if act.max is not None:
				377	act.max = scale_f32 * quantise_float32(act.max, scale_f32, zero_point)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	378	act.lookup_table_index = op.activation.lut_index
				379	return act
				380
				381
				382	def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
				383	"""Sets common fields of the given operation"""
				384	ps = cmd.ps
				385	op = ps.primary_op
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	386
				387	ifm_height = cmd.ifm_box.get_block().height
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	388	ifm_width = cmd.ps.ifm_shapes[0].width
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	389	ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	390
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	391	npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0])
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	392	npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	393	npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	394
				395	out_block = cmd.ofm_box.get_block()
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	396	npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	397	npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	398	npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
				399
				400	if cmd.weight_tensor is not None:
Tim Hall	d784af7	2021-06-08 21:25:57 +0100	[diff] [blame]	401	npu_op.weights, npu_op.biases = create_weights(cmd.weight_tensor, cmd.weight_box, cmd.scale_tensor, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	402	npu_op.activation = create_npu_activation(op)
Patrik Gustavsson	b0ca274	2020-11-18 07:59:09 +0100	[diff] [blame]	403	npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
				404	npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	405	npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])
				406
				407	if not op.type.is_elementwise_op():
				408	npu_op.padding = create_padding(cmd, op)
				409	npu_op.kernel = to_npu_kernel(op.kernel)
Tim Hall	3c5cfe9	2022-03-16 16:31:57 +0000	[diff] [blame^]	410	npu_op.ifm_upscale = resampling_mode_inv_map[op.ifm_resampling_mode]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	411	return npu_op
				412
				413
				414	def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
				415	"""Converts the command to NpuConv2DOperation"""
				416	npu_op = NpuConv2DOperation()
				417	set_common_op_fields(npu_op, cmd, arch)
				418	if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				419	npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
				420	else:
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	421	if cmd.weight_tensor.src_tensor:
				422	npu_op.block_traversal = cmd.weight_tensor.src_tensor.hw_traversal
				423	else:
				424	npu_op.block_traversal = cmd.weight_tensor.hw_traversal
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	425	return npu_op
				426
				427
				428	def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
				429	"""Converts the command to NpuConvDepthWiseOperation"""
				430	npu_op = NpuConvDepthWiseOperation()
				431	set_common_op_fields(npu_op, cmd, arch)
				432	return npu_op
				433
				434
				435	def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
				436	"""Converts the command to NpuPoolingOperation"""
				437	ps = cmd.ps
				438	op = ps.primary_op
				439	pool_op = NpuPoolingOp.AVERAGE
				440	if op.type.is_maxpool_op():
				441	pool_op = NpuPoolingOp.MAX
				442	elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
				443	pool_op = NpuPoolingOp.AVERAGE
				444	elif op.type == Op.ReduceSum:
				445	pool_op = NpuPoolingOp.REDUCE_SUM
				446	else:
				447	assert 0, f"Unknown pool type {op.type}"
				448	npu_op = NpuPoolingOperation(pool_op)
				449	set_common_op_fields(npu_op, cmd, arch)
				450	# Pooling specific info
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame]	451	npu_op.rescale = op.rescale
Patrik Gustavsson	c74682c	2021-08-17 14:26:38 +0200	[diff] [blame]	452	if op.explicit_scaling:
				453	# Note: reuse of rescale for explicit scaling to not expose this in the external API
				454	assert npu_op.rescale is None
				455	npu_op.rescale = op.explicit_scaling
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	456	return npu_op
				457
				458
				459	def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
				460	"""Converts the command to NpuElementWiseOperation"""
				461	ps = cmd.ps
				462	op = ps.primary_op
				463	assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
				464	elemwise_op = elementwise_op_map[op.type]
				465	npu_op = NpuElementWiseOperation(elemwise_op)
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	466
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	467	if elemwise_op not in UNARY_ELEMWISE_OPS:
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	468	ifm_shape = [] if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0].as_list()
				469	ifm2_shape = [] if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1].as_list()
				470	if not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	471	# The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
				472	cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
				473	cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	474	ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	475	npu_op.reversed_operands = True
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	476	npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch, ps.ifm_shapes[1])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	477	npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
				478	if cmd.ifm2_tensor.shape == []:
				479	# scalar
James Peet	7519d50	2021-07-19 16:47:58 +0100	[diff] [blame]	480	npu_op.ifm2_scalar = cmd.ifm2_tensor.get_scalar()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	481	npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
				482	else:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	483	ifm2_blk = cmd.ifm2_box.get_block()
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	484	ifm2_width = ps.ifm_shapes[1].width
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	485	npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_width, depth=ifm2_blk.depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	486	set_common_op_fields(npu_op, cmd, arch)
				487	# Check if output scale needs to be overridden
				488	output_scale = None
				489	if op.type == Op.Add and "resizebilinear" in op.attrs:
				490	# Force output scale same as the input scale for
				491	# resizebilinear 1x1 that is converted to add
				492	output_scale = npu_op.ifm2.quantization.scale_f32
Fredrik Svedberg	f2afd7f	2021-02-01 21:42:12 +0100	[diff] [blame]	493	if op.type == Op.Abs:
				494	output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	495	if op.type == Op.LeakyRelu:
				496	output_scale = op.attrs["alpha"]
Patrik Gustavsson	b081d67	2021-08-25 13:49:25 +0200	[diff] [blame]	497	if op.type in (Op.RescaleAdd, Op.RescaleMul):
Fredrik Svedberg	e82be7c	2021-01-18 15:21:03 +0100	[diff] [blame]	498	assert op.rescale is not None, f"{op.type} must have rescale"
				499	npu_op.rescale = op.rescale
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	500	if op.type in (Op.Add, Op.Mul, Op.Sub):
				501	if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
				502	output_scale = 1 / 0x3000
				503	if output_scale is not None:
				504	npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
				505	return npu_op
				506
				507
				508	def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
				509	"""Converts the command to NpuDmaOperation"""
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	510	src_region = get_region(cmd.in_tensor.mem_type, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	511	if cmd.out_tensor.purpose == TensorPurpose.LUT:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	512	dest_region = BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	513	else:
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	514	dest_region = get_region(cmd.out_tensor.mem_type, arch)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	515
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	516	if cmd.in_tensor.purpose == TensorPurpose.Weights:
				517	# Get weight range per core
				518	sz = 0
				519	for core in range(0, arch.ncores):
				520	key = WeightKey(core, cmd.box.start_coord[-1])
				521	if key in cmd.in_tensor.encoded_ranges:
				522	weight_range = cmd.in_tensor.encoded_ranges[key]
				523	sz += round_up(weight_range.total_bytes, 16)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	524
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	525	if core == 0:
				526	weight_range = cmd.in_tensor.encoded_ranges[key]
				527	src_addr = cmd.in_tensor.address + weight_range.offset
				528
				529	if cmd.out_tensor.sub_purpose == TensorSubPurpose.DoubleBuffer:
				530	dest_addr = cmd.out_tensor.address + cmd.in_tensor.max_range_bytes * (
Louis Verhaard	e91b531	2022-01-21 13:38:50 +0100	[diff] [blame]	531	get_double_buffer_offset(arch, weight_range.index, core)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	532	)
				533	else:
				534	dest_addr = cmd.out_tensor.address
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	535	else:
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	536	start_coord = cmd.box.start_coord
				537	src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
				538	dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	539	sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
				540	src = NpuAddressRange(src_region, int(src_addr), int(sz))
				541	dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
				542	return NpuDmaOperation(src, dest)
				543
				544
				545	def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
				546	"""Converts the high level command to NpuOperation"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	547	npu_op: NpuOperation
				548	if isinstance(cmd, DMA):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	549	npu_op = create_dma_op(cmd, arch)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	550	elif isinstance(cmd, NpuStripe):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	551	npu_block_type = cmd.ps.primary_op.type.npu_block_type
				552	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
				553	npu_op = create_npu_conv2d_op(cmd, arch)
				554	elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
				555	npu_op = create_npu_conv_depthwise_op(cmd, arch)
				556	elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
				557	npu_op = create_npu_pool_op(cmd, arch)
				558	elif npu_block_type == NpuBlockType.ElementWise:
				559	npu_op = create_npu_elementwise_op(cmd, arch)
				560	else:
				561	assert 0, f"Unknown command type {npu_block_type}"
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	562	return npu_op
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	563
				564
				565	def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
				566	"""Generates command stream for the subgraph, adds it to sg.register_command_stream"""
				567	# Convert high level command stream to list of NpuOperation
				568	npu_op_list = []
				569	npu_op_to_cmd = dict() # map from npu op to high level command
				570	for cmd in sg.high_level_command_stream:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	571	if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	572	print("Warning: Skipping register command stream generation for", cmd.ps)
				573	else:
				574	npu_op = convert_command_to_npu_op(cmd, arch)
				575	npu_op_list.append(npu_op)
				576	npu_op_to_cmd[npu_op] = cmd
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	577	mem_limits = get_mem_limits_for_regions(arch)
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	578	# Generate register commands
erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	579	if len(sg.high_level_command_stream) > 0:
				580	stream_id = DebugDatabase.add_stream(sg)
				581	sg.generated_stream_id = stream_id
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	582
erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	583	def add_to_debug_db(npu_op: NpuOperation, offset: int):
				584	"""Adds info to the debug database"""
				585	if not isinstance(npu_op, NpuDmaOperation):
				586	cmd = npu_op_to_cmd[npu_op]
				587	DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	588
Louis Verhaard	024c355	2021-03-17 14:26:34 +0100	[diff] [blame]	589	sg.register_command_stream = generate_command_stream(
				590	npu_op_list, arch, verbose, mem_limits, add_to_debug_db, npu_op_to_cmd
				591	)