# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Conversion from high level command to NpuOperation
from enum import IntEnum
from typing import cast
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import ArchitectureFeatures
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .high_level_command_stream import Box
from .high_level_command_stream import Command
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .numeric_util import quantise_float32
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .register_command_stream_generator import generate_command_stream
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import to_npu_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .shape4d import Shape4D
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor
from .weight_compressor import WeightKey


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


dtype_map = {
    DataType.uint8: NpuDataType.UINT8,
    DataType.int8: NpuDataType.INT8,
    DataType.uint16: NpuDataType.UINT16,
    DataType.int16: NpuDataType.INT16,
    DataType.int32: NpuDataType.INT32,
}


# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
elementwise_op_map = {
    Op.Mul: NpuElementWiseOp.MUL,
    Op.RescaleMul: NpuElementWiseOp.MUL,
    Op.Add: NpuElementWiseOp.ADD,
    Op.RescaleAdd: NpuElementWiseOp.ADD,
    Op.Sub: NpuElementWiseOp.SUB,
    Op.Minimum: NpuElementWiseOp.MIN,
    Op.Maximum: NpuElementWiseOp.MAX,
    Op.LeakyRelu: NpuElementWiseOp.LRELU,
    Op.Abs: NpuElementWiseOp.ABS,
    Op.CLZ: NpuElementWiseOp.CLZ,
    Op.SHR: NpuElementWiseOp.SHR,
    Op.SHL: NpuElementWiseOp.SHL,
}


# inverse of the resampling_mode_map in the register command stream generator
resampling_mode_inv_map = {
    resampling_mode.NONE: NpuResamplingMode.NONE,
    resampling_mode.NEAREST: NpuResamplingMode.NEAREST,
    resampling_mode.TRANSPOSE: NpuResamplingMode.TRANSPOSE,
}


def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
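    # A scalar or broadcast feature map has to be placed in IFM2, so this returns False when
    # the operands need to be swapped (the caller then sets reversed_operands). For example,
    # ifm_ifm2_correct_order([1, 8, 8, 1], [1, 8, 8, 16]) returns False because IFM1 is the
    # broadcast operand.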
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False
    return True


def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used"""
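    # TFLite (TFL) rounding is the default. NATURAL rounding is selected for ResizeBilinear,
    # for int16 convolutions and for some 1x1 average-pool based concat slice writes, and an
    # explicitly set op.rounding_mode always takes precedence.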
    rounding_mode = NpuRoundingMode.TFL
    if op.type == Op.ResizeBilinear:
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op.type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    if op.rounding_mode is not None:
        rounding_mode = op.rounding_mode
    return rounding_mode


def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
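    # The explicit padding from the graph attributes is adjusted per stripe: with horizontal
    # IFM streaming the top/bottom padding comes from the stripe itself, and left/right
    # padding is cleared when the IFM box does not touch the (possibly slice-read adjusted)
    # horizontal edges of the feature map.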
    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        return NpuPadding(top=0, left=0, bottom=0, right=0)
    top, left, bottom, right = primary_op.attrs["explicit_padding"]

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        top = cmd.pad_top
        bottom = cmd.pad_bottom

    # The IFM box coordinate range depends on whether the primary op was combined with a split slice read
    ifm_read_offset = primary_op.read_offsets[0]
    ifm_read_shape = primary_op.read_shapes[0]
    if ifm_read_offset is None or len(ifm_read_offset) < 2:
        box_start_coord_min = 0
        box_end_coord_max = cmd.ps.ifm_shapes[0].width
    else:
        box_start_coord_min = ifm_read_offset[-2]
        box_end_coord_max = ifm_read_shape[-2]

    # Index from the end, since a 1x1 AvgPool might have been added with non-4-dimensional input/output
    # because an activation function needed to be fused.
    if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > box_start_coord_min:
        left = 0
    if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < box_end_coord_max:
        right = 0
    return NpuPadding(top=top, left=left, bottom=bottom, right=right)


def get_region(mem_type: MemType, arch: ArchitectureFeatures) -> int:
    base_ptr_idx_map = {
        MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
        MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
        MemType.Scratch: BasePointerIndex.ScratchTensor,
    }

    if arch.is_spilling_enabled():
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
    else:
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor

    return base_ptr_idx_map[mem_type].value


def get_mem_limits_for_regions(arch: ArchitectureFeatures) -> Dict[int, int]:
    """Returns map region -> max size of the region in bytes"""
    mem_limits = dict()
    for mem_type in MemType.all():
        mem_limits[get_region(mem_type, arch)] = arch.mem_type_size(mem_type)
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return mem_limits


def get_double_buffer_offset(arch: ArchitectureFeatures, range_index: int, core: int) -> int:
    """Returns 0 if the first half of a double buffer should be used, 1 if the second half should be used"""
    return ((range_index - core) // arch.ncores) % 2


def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        block = ifm_box.get_block()
    else:
        block = ofm_box.get_block()
    return block.depth


def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
    """Checks if quantization should use 0 as zero point"""
    if tens.dtype == DataType.int32 and is_ifm_tensor:
        return True
    if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
        return False
    if ps.primary_op.type == Op.AvgPool and ps.primary_op.explicit_scaling:
        return False
    fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
    forced_ofm_quantization = ps.primary_op.forced_output_quantization
    use_0 = (
        (
            ps.primary_op.activation is None
            or forced_ofm_quantization is not None
            or (
                ps.primary_op.type.is_avgpool_op()
                and ps.primary_op.activation.op_type.is_relu_op()
                and not ps.primary_op.rescale
            )
        )
        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
        and not fused_quantize
    )
    return use_0


def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for IFM/IFM2"""
    op = ps.primary_op
    ifm_quant = op.forced_input_quantization if op.forced_input_quantization is not None else tens.quantization
    if ifm_quant is None:
        return None
    if use_zero_point_0(ps, tens, True):
        zero_point = 0
    else:
        zero_point = int(ifm_quant.zero_point)
    return NpuQuantization(scale_f32=ifm_quant.scale_f32, zero_point=zero_point)


def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for OFM"""
    op = ps.primary_op
    # Check if the operation's output quantization should be used instead of the output tensor's quantization
    # (used in LUTs)
    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
    if ofm_quant is None:
        return None
    if use_zero_point_0(ps, tens, False):
        zero_point = 0
    else:
        zero_point = int(ofm_quant.zero_point)
    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)


def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
    """Creates feature map with common fields populated"""
    fm = NpuFeatureMap()
    fm.region = get_region(tens.mem_type, arch)
    fm.data_type = dtype_map[tens.dtype]
    if tens.format == TensorFormat.NHWC:
        fm.layout = NpuLayout.NHWC
    elif tens.format == TensorFormat.NHCWB16:
        fm.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"
    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
        box.start_coord, box.end_coord, op_shape4D
    )
    for idx, addr in enumerate(addresses):
        if addr is None:
            addresses[idx] = 0
    fm.tiles = NpuTileBox(
        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
    )
    strides = tens.get_strides(shape4D=op_shape4D)
    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
    fm.name = tens.name
    return fm


def create_weights(
    weight_tensor: NpuWeightTensor, weight_box: Box, scale_tensor: NpuWeightTensor, arch: ArchitectureFeatures
) -> Tuple[List[NpuAddressRange], List[NpuAddressRange]]:
    """Returns address ranges for weights and scales"""
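    # Weights are encoded per core: for each core the matching encoded range is looked up and
    # turned into an NpuAddressRange. Double-buffered weights are offset into one half of the
    # buffer (see get_double_buffer_offset), and the scales/biases either come from a
    # standalone scale tensor or share the weight tensor's region.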
    weights = []
    biases = []
    shared_region = get_region(weight_tensor.mem_type, arch)
    scale_region = get_region(scale_tensor.mem_type, arch) if scale_tensor else 0

    w_tensor_src = weight_tensor
    if weight_tensor.src_tensor:
        w_tensor_src = cast(NpuWeightTensor, weight_tensor.src_tensor)

    core_offset = 0
    for core in range(0, arch.ncores):
        # Get weight range per core
        key = WeightKey(core, weight_box.start_coord[-1])
        if key in w_tensor_src.encoded_ranges:
            weight_range = w_tensor_src.encoded_ranges[key]
            if weight_tensor.sub_purpose == TensorSubPurpose.DoubleBuffer:
                assert weight_tensor != w_tensor_src
                # Double buffered inside weight_tensor
                address = weight_tensor.address + core_offset
                address += get_double_buffer_offset(arch, weight_range.index, core) * w_tensor_src.max_range_bytes
                core_offset += round_up(weight_range.total_bytes, 16)
            else:
                if weight_tensor == w_tensor_src:
                    # Straight from source tensor
                    address = weight_tensor.address + weight_range.offset
                else:
                    # Single buffered inside weight tensor
                    address = weight_tensor.address + core_offset
                    core_offset += round_up(weight_range.total_bytes, 16)

            # Location of weights in tensor
            addr_range = NpuAddressRange(
                shared_region, int(address + weight_range.weight_offset), round_up(int(weight_range.weight_bytes), 16)
            )
            weights.append(addr_range)

            # Location of standalone scales or combined weights tensor scales
            if scale_tensor:
                assert scale_tensor.src_tensor is None  # Must be standalone
                scale_range = scale_tensor.encoded_ranges[key]
                address = scale_tensor.address + scale_range.offset
                addr_range = NpuAddressRange(scale_region, int(address), round_up(int(scale_range.scale_bytes), 16))
            else:
                addr_range = NpuAddressRange(shared_region, int(address), round_up(int(weight_range.scale_bytes), 16))

            biases.append(addr_range)

    return weights, biases


def create_npu_activation(op: Operation) -> NpuActivation:
    """Creates fused activation function"""
    if op.activation is None:
        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
    faf = op.activation.op_type
    act_op = NpuActivationOp.NONE_OR_RELU
    if faf == Op.Tanh:
        act_op = NpuActivationOp.TANH
    elif faf == Op.Sigmoid:
        act_op = NpuActivationOp.SIGMOID
    elif faf == Op.LUT:
        act_op = NpuActivationOp.TABLE_LOOKUP
    elif not faf.is_relu_op():
        raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")

    act = NpuActivation(act_op)
    act.min = op.activation.min
    act.max = op.activation.max
    if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.rescale:
        quant = op.ofm.quantization
        if quant and quant.zero_point:  # Zero point is not 0
            scale_f32 = 1 if quant.scale_f32 is None else quant.scale_f32
            zero_point = quant.zero_point
            if act.min is not None:
                act.min = scale_f32 * quantise_float32(act.min, scale_f32, zero_point)
            if act.max is not None:
                act.max = scale_f32 * quantise_float32(act.max, scale_f32, zero_point)
    act.lookup_table_index = op.activation.lut_index
    return act


def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
    """Sets common fields of the given operation"""
    ps = cmd.ps
    op = ps.primary_op

    ifm_height = cmd.ifm_box.get_block().height
    ifm_width = cmd.ps.ifm_shapes[0].width
    ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)

    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0])
    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)

    out_block = cmd.ofm_box.get_block()
    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
    npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)

    if cmd.weight_tensor is not None:
        npu_op.weights, npu_op.biases = create_weights(cmd.weight_tensor, cmd.weight_box, cmd.scale_tensor, arch)
    npu_op.activation = create_npu_activation(op)
    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
    npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])

    if not op.type.is_elementwise_op():
        npu_op.padding = create_padding(cmd, op)
        npu_op.kernel = to_npu_kernel(op.kernel)
    npu_op.ifm_upscale = resampling_mode_inv_map[op.ifm_resampling_mode]
    return npu_op


def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
    """Converts the command to NpuConv2DOperation"""
    npu_op = NpuConv2DOperation()
    set_common_op_fields(npu_op, cmd, arch)
    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    else:
        if cmd.weight_tensor.src_tensor:
            npu_op.block_traversal = cmd.weight_tensor.src_tensor.hw_traversal
        else:
            npu_op.block_traversal = cmd.weight_tensor.hw_traversal
    return npu_op


def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
    """Converts the command to NpuConvDepthWiseOperation"""
    npu_op = NpuConvDepthWiseOperation()
    set_common_op_fields(npu_op, cmd, arch)
    return npu_op


def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
    """Converts the command to NpuPoolingOperation"""
    ps = cmd.ps
    op = ps.primary_op
    pool_op = NpuPoolingOp.AVERAGE
    if op.type.is_maxpool_op():
        pool_op = NpuPoolingOp.MAX
    elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
        pool_op = NpuPoolingOp.AVERAGE
    elif op.type == Op.ReduceSum:
        pool_op = NpuPoolingOp.REDUCE_SUM
    else:
        assert 0, f"Unknown pool type {op.type}"
    npu_op = NpuPoolingOperation(pool_op)
    set_common_op_fields(npu_op, cmd, arch)
    # Pooling specific info
    npu_op.rescale = op.rescale
    if op.explicit_scaling:
        # Note: rescale is reused for explicit scaling so that it is not exposed in the external API
        assert npu_op.rescale is None
        npu_op.rescale = op.explicit_scaling
    return npu_op


def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
    """Converts the command to NpuElementWiseOperation"""
    ps = cmd.ps
    op = ps.primary_op
    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
    elemwise_op = elementwise_op_map[op.type]
    npu_op = NpuElementWiseOperation(elemwise_op)

    if elemwise_op not in UNARY_ELEMWISE_OPS:
        ifm_shape = [] if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0].as_list()
        ifm2_shape = [] if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1].as_list()
        if not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
            # The scalar/broadcast feature map has to be the IFM2 tensor, so switch the IFMs
            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
            ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
            npu_op.reversed_operands = True
        npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch, ps.ifm_shapes[1])
        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
        if cmd.ifm2_tensor.shape == []:
            # scalar
            npu_op.ifm2_scalar = cmd.ifm2_tensor.get_scalar()
            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
        else:
            ifm2_blk = cmd.ifm2_box.get_block()
            ifm2_width = ps.ifm_shapes[1].width
            npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_width, depth=ifm2_blk.depth)
    set_common_op_fields(npu_op, cmd, arch)
    # Check if output scale needs to be overridden
    output_scale = None
    if op.type == Op.Add and "resizebilinear" in op.attrs:
        # Force the output scale to be the same as the input scale for a
        # resizebilinear 1x1 that has been converted to an add
        output_scale = npu_op.ifm2.quantization.scale_f32
    if op.type == Op.Abs:
        output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
    if op.type == Op.LeakyRelu:
        output_scale = op.attrs["alpha"]
    if op.type in (Op.RescaleAdd, Op.RescaleMul):
        assert op.rescale is not None, f"{op.type} must have rescale"
        npu_op.rescale = op.rescale
    if op.type in (Op.Add, Op.Mul, Op.Sub):
        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
            output_scale = 1 / 0x3000
    if output_scale is not None:
        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
    return npu_op


def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
    """Converts the command to NpuDmaOperation"""
    src_region = get_region(cmd.in_tensor.mem_type, arch)
    if cmd.out_tensor.purpose == TensorPurpose.LUT:
        dest_region = BASE_PTR_INDEX_MEM2MEM
    else:
        dest_region = get_region(cmd.out_tensor.mem_type, arch)

    if cmd.in_tensor.purpose == TensorPurpose.Weights:
        # Get weight range per core
        sz = 0
        for core in range(0, arch.ncores):
            key = WeightKey(core, cmd.box.start_coord[-1])
            if key in cmd.in_tensor.encoded_ranges:
                weight_range = cmd.in_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

                if core == 0:
                    weight_range = cmd.in_tensor.encoded_ranges[key]
                    src_addr = cmd.in_tensor.address + weight_range.offset

                    if cmd.out_tensor.sub_purpose == TensorSubPurpose.DoubleBuffer:
                        dest_addr = cmd.out_tensor.address + cmd.in_tensor.max_range_bytes * (
                            get_double_buffer_offset(arch, weight_range.index, core)
                        )
                    else:
                        dest_addr = cmd.out_tensor.address
    else:
        start_coord = cmd.box.start_coord
        src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
        dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
    src = NpuAddressRange(src_region, int(src_addr), int(sz))
    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
    return NpuDmaOperation(src, dest)


def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
    """Converts the high level command to NpuOperation"""
    npu_op: NpuOperation
    if isinstance(cmd, DMA):
        npu_op = create_dma_op(cmd, arch)
        npu_op.name = cmd.out_tensor.name
    elif isinstance(cmd, NpuStripe):
        npu_block_type = cmd.ps.primary_op.type.npu_block_type
        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
            npu_op = create_npu_conv2d_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            npu_op = create_npu_conv_depthwise_op(cmd, arch)
        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            npu_op = create_npu_pool_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ElementWise:
            npu_op = create_npu_elementwise_op(cmd, arch)
        else:
            assert 0, f"Unknown command type {npu_block_type}"
        npu_op.name = cmd.ps.primary_op.name
    return npu_op


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    mem_limits = get_mem_limits_for_regions(arch)
    # Generate register commands
    if len(sg.high_level_command_stream) > 0:
        stream_id = DebugDatabase.add_stream(sg)
        sg.generated_stream_id = stream_id

        def add_to_debug_db(npu_op: NpuOperation, offset: int):
            """Adds info to the debug database"""
            if not isinstance(npu_op, NpuDmaOperation):
                cmd = npu_op_to_cmd[npu_op]
                DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

        sg.register_command_stream = generate_command_stream(
            npu_op_list, arch, verbose, mem_limits, add_to_debug_db, npu_op_to_cmd
        )