# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Conversion from high level command to NpuOperation
from enum import IntEnum
from typing import cast
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import ArchitectureFeatures
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .high_level_command_stream import Box
from .high_level_command_stream import Command
from .high_level_command_stream import DMA
from .high_level_command_stream import NOP
from .high_level_command_stream import NpuStripe
from .numeric_util import quantise_float32
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation import RoundingMode
from .register_command_stream_generator import generate_command_stream
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import to_npu_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .shape4d import Shape4D
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .weight_compressor import NpuWeightTensor
from .weight_compressor import WeightKey


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor


dtype_map = {
    DataType.uint8: NpuDataType.UINT8,
    DataType.int8: NpuDataType.INT8,
    DataType.uint16: NpuDataType.UINT16,
    DataType.int16: NpuDataType.INT16,
    DataType.int32: NpuDataType.INT32,
}


# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
elementwise_op_map = {
    Op.Mul: NpuElementWiseOp.MUL,
    Op.Add: NpuElementWiseOp.ADD,
    Op.Sub: NpuElementWiseOp.SUB,
    Op.Minimum: NpuElementWiseOp.MIN,
    Op.Maximum: NpuElementWiseOp.MAX,
    Op.LeakyRelu: NpuElementWiseOp.LRELU,
    Op.Abs: NpuElementWiseOp.ABS,
    Op.CLZ: NpuElementWiseOp.CLZ,
    Op.SHR: NpuElementWiseOp.SHR,
    Op.SHL: NpuElementWiseOp.SHL,
}


# inverse of the resampling_mode_map in the register command stream generator
resampling_mode_inv_map = {
    resampling_mode.NONE: NpuResamplingMode.NONE,
    resampling_mode.NEAREST: NpuResamplingMode.NEAREST,
    resampling_mode.TRANSPOSE: NpuResamplingMode.TRANSPOSE,
}


rounding_mode_map = {
    RoundingMode.TFLite: NpuRoundingMode.TFL,
    RoundingMode.ToZero: NpuRoundingMode.TRUNCATE,
    RoundingMode.HalfUp: NpuRoundingMode.NATURAL,
    RoundingMode.AwayZero: NpuRoundingMode.NATURAL,
}
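# Note: RoundingMode.AwayZero also maps to NpuRoundingMode.NATURAL, since the external API only
# exposes TFL, TRUNCATE and NATURAL; the away-from-zero cases are additionally handled by forcing
# the zero point to 0 for the affected converted operators (see use_zero_point_0 below).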


def ifm_ifm2_correct_order(ifm_shape: Shape4D, ifm2_shape: Shape4D) -> bool:

    if ifm_shape is None:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape is None:
        return True

    for ifm, ifm2 in zip(ifm_shape.as_list(), ifm2_shape.as_list()):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False
    return True
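# Illustrative example (hypothetical shapes): with an IFM of shape 1x1x1x8 and an IFM2 of shape
# 1x4x4x8 the broadcast dimension sits in the first operand, so this returns False and
# create_npu_elementwise_op swaps the feature maps (and sets reversed_operands) before the
# stripe is converted.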


def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used"""
    rounding_mode = NpuRoundingMode.TFL
    if op.type.is_resize_op():
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        op.original_type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op.type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    if op.rounding_mode is not None:
        rounding_mode = rounding_mode_map[op.rounding_mode]
    return rounding_mode


def create_padding(cmd: NpuStripe, primary_op: Operation, npu_op: NpuBlockOperation) -> NpuPadding:
    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        return NpuPadding(top=0, left=0, bottom=0, right=0)
    top, left, bottom, right = primary_op.attrs["explicit_padding"]

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        top = cmd.pad_top
        bottom = cmd.pad_bottom

    # The ifm box coordinate range depends upon whether the primary op was combined with a split slice read
    ifm_read_offset = primary_op.read_offsets[0]
    ifm_read_shape = primary_op.read_shapes[0]
    if ifm_read_offset is None or len(ifm_read_offset) < 2:
        box_start_coord_min = 0
        box_end_coord_max = cmd.ps.ifm_shapes[0].width
    else:
        box_start_coord_min = ifm_read_offset[-2]
        box_end_coord_max = ifm_read_shape[-2]

    # Indexing from the end, since a 1x1 AvgPool might have been added with non 4-dimensional input/output
    # because an activation function needed to be fused.
    if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > box_start_coord_min:
        left = 0
    if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < box_end_coord_max:
        right = 0

    # If tile padding is selected, modify the tile base addresses and set NpuPadding to zero.
    if primary_op.attrs.get("padding", None) == Padding.TILE:
        assert cmd.ifm_tensor.format == TensorFormat.NHCWB16, "Tensor format NHCWB16 required to perform tile padding"
        assert npu_op.op_type == NpuOperationType.ConvDepthWise, "Tile padding only supported for depthwise convolution"
        assert npu_op.ifm is not None, "Feature map must be initialized to modify the tile addresses"
        npu_op.ifm.tiles = modify_tile_addresses_for_padding(
            npu_op.ifm.tiles,
            primary_op.attrs.get("explicit_padding", None),
            channels=cmd.ps.ifm_shapes[0].depth,
            dtype=cmd.ifm_tensor.dtype,
        )
        top, left, bottom, right = 0, 0, 0, 0

    return NpuPadding(top=top, left=left, bottom=bottom, right=right)
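# Note on the horizontal padding above: a stripe whose IFM box does not start or end at the
# (possibly split-slice adjusted) edge of the feature map gets its left/right padding cleared,
# since that data is read from the neighbouring part of the tensor rather than supplied as padding.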


def modify_tile_addresses_for_padding(
    tile_box: NpuTileBox, padding_direction: List[int], channels: int, dtype: DataType
) -> NpuTileBox:
    # Addresses are 16-byte aligned when using the NHCWB16 format, which is required to utilize tiling
    # Calculate the offset to top right, bottom left and bottom right element in the IFM (top left offset is 0)
    """
    Example: 4x4x1 IFM
    | a b c d |  <-- Offset to TR ('d') is (w0-1) = 3
    | e f g h |
    | i j k l |
    | m n o p |  <-- Offset to BL ('m') is (w0*(h0-1)) = 12 and to BR ('p') ((w0*h0)-1) = 15
    """
    h0, h1, w0, addresses = tile_box
    elem_size = 2 if dtype == DataType.int16 else 1
    tr_offset = (w0 - 1) * 16 * elem_size
    bl_offset = w0 * (h0 - 1) * 16 * (round_up(channels, 16) // 16) * elem_size
    br_offset = tr_offset + bl_offset

    # Explicit padding order: (Top, Left, Bottom, Right)
    if padding_direction == (1, 1, 0, 0):
        # Pad top left corner
        """
                   | a a b |
        | a b | -> | a a b |
        | c d |    | c c d |
        """
        addresses = [addresses[0]] * 4
        h0, h1, w0 = 1, 1, 1

    elif padding_direction == (1, 0, 0, 1):
        # Pad top right corner
        """
                   | a b b |
        | a b | -> | a b b |
        | c d |    | c d d |
        """
        addresses = [addresses[0], addresses[0] + tr_offset, addresses[0], addresses[0] + tr_offset]
        h0, h1, w0 = 1, 1, w0

    elif padding_direction == (0, 1, 1, 0):
        # Pad bottom left corner
        """
        | a b |    | a a b |
        | c d | -> | c c d |
                   | c c d |
        """
        addresses = [addresses[0], addresses[0], addresses[0] + bl_offset, addresses[0] + bl_offset]
        h0, h1, w0 = h0, h1, 1

    elif padding_direction == (0, 0, 1, 1):
        # Pad bottom right corner
        """
        | a b |    | a b b |
        | c d | -> | c d d |
                   | c d d |
        """
        addresses = [
            addresses[0],
            addresses[0] + tr_offset,
            addresses[0] + bl_offset,
            addresses[0] + br_offset,
        ]
        # h0, h1, w0 = h0, h1, w0
    else:
        assert 0, "Invalid padding direction for tile padding"

    return NpuTileBox(height_0=h0, height_1=h1, width_0=w0, addresses=[int(addr) for addr in addresses])
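# Worked example of the offsets above, assuming a 4x4x1 int8 IFM (illustration only): elem_size
# is 1 and there is a single 16-byte channel brick, so tr_offset = (4 - 1) * 16 = 48,
# bl_offset = 4 * (4 - 1) * 16 = 192 and br_offset = 48 + 192 = 240 bytes from the tile 0 address.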


def get_region(mem_type: MemType, arch: ArchitectureFeatures) -> int:
    base_ptr_idx_map = {
        MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
        MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
        MemType.Scratch: BasePointerIndex.ScratchTensor,
    }

    if arch.is_spilling_enabled():
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
    else:
        base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor

    return base_ptr_idx_map[mem_type].value


def get_mem_limits_for_regions(arch: ArchitectureFeatures) -> Dict[int, int]:
    """Returns map region -> max size of the region in bytes"""
    mem_limits = dict()
    for mem_type in MemType.all():
        mem_limits[get_region(mem_type, arch)] = arch.mem_type_size(mem_type)
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return mem_limits
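# The returned mapping is keyed on base pointer index, e.g. roughly (the actual sizes depend
# entirely on the configured ArchitectureFeatures and on whether spilling is enabled):
#   {0: <permanent storage limit>, 1: <scratch limit>, 2: <fast scratch limit>,
#    BASE_PTR_INDEX_MEM2MEM: arch.shram_size_bytes}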


def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        block = ifm_box.get_block()
    else:
        block = ofm_box.get_block()
    return block.depth


def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
    """Checks if quantization should use 0 as zero point"""
    if tens.dtype == DataType.int32 and is_ifm_tensor:
        return True
    if ps.primary_op.rounding_mode == RoundingMode.AwayZero:
        if (
            ps.primary_op.original_type == Op.AvgPool
            and ps.primary_op.type == Op.Conv2DBias
            and ps.primary_op.attrs.get("padding", None) in (Padding.EXPLICIT, Padding.VALID)
        ):
            # Force zero point to 0 for AveragePool operators converted to a Conv2DBias with rounding away from
            # zero.
            return True
        if ps.primary_op.original_type == Op.ResizeBilinear and ps.primary_op.type == Op.DepthwiseConv2DBias:
            # Force zero point to 0 for ResizeBilinear operators converted to a DepthwiseConv with rounding away from
            # zero. This is because the reference kernel ignores the zero points.
            return True
        if (
            not is_ifm_tensor
            and ps.primary_op.original_type == Op.AvgPool
            and ps.primary_op.attrs.get("padding", None) == Padding.EXPLICIT
            and ps.primary_op.type == Op.DepthwiseConv2DBias
        ):
            # Force zero point to 0 for the OFM of AvgPool operators that have been combined with a previous PAD
            # operator and converted to a DepthwiseConv with rounding away from zero. This is because the zero point
            # will already have been applied in the Bias.
            return True
    if ps.primary_op.type not in (Op.AvgPool, Op.CLZ, Op.SHL) and not ps.primary_op.type.is_resize_op():
        return False
    if ps.primary_op.type == Op.AvgPool and ps.primary_op.explicit_scaling:
        return False
    fused_quantize = any(op.type == Op.Quantize or op.original_type == Op.Quantize for op in ps.ops)
    forced_ofm_quantization = ps.primary_op.forced_output_quantization
    use_0 = (
        (
            ps.primary_op.activation is None
            or forced_ofm_quantization is not None
            or (ps.primary_op.type.is_avgpool_op() and ps.primary_op.activation.op_type.is_relu_op())
        )
        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
        and not fused_quantize
    )
    return use_0


def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for IFM/IFM2"""
    op = ps.primary_op
    ifm_quant = op.forced_input_quantization if op.forced_input_quantization is not None else tens.quantization
    if ifm_quant is None:
        return None
    if use_zero_point_0(ps, tens, True):
        zero_point = 0
    else:
        zero_point = int(ifm_quant.zero_point)
    return NpuQuantization(scale_f32=ifm_quant.scale_f32, zero_point=zero_point)


def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for OFM"""
    op = ps.primary_op
    # Check if the operation's output quantization should be used instead of the output tensor's quantization
    # (used in LUTs)
    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
    if ofm_quant is None:
        return None
    if use_zero_point_0(ps, tens, False):
        zero_point = 0
    else:
        zero_point = int(ofm_quant.zero_point)
    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)


def create_feature_map(
    tens: Tensor,
    box: Box,
    arch: ArchitectureFeatures,
    op_shape4D: Shape4D,
    tile_base_offsets: List[int],
    stride_multiplier: Optional[List[int]] = None,
) -> NpuFeatureMap:
    """Creates feature map with common fields populated"""
    fm = NpuFeatureMap()
    fm.region = get_region(tens.mem_type, arch)
    fm.data_type = dtype_map[tens.dtype]
    if tens.format == TensorFormat.NHWC:
        fm.layout = NpuLayout.NHWC
    elif tens.format == TensorFormat.NHCWB16:
        fm.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"

    strides = tens.get_strides(op_shape4D)
    assert strides is not None

    if stride_multiplier and stride_multiplier != [1, 1, 1]:
        assert (
            tens.format == TensorFormat.NHWC
        ), "Only default stride multiplier ([1, 1, 1]) supported for NHCWB16 format"
        # Multiply strides for C/H/W (in that order) with corresponding stride factor
        for i, stride_factor in enumerate(stride_multiplier, start=1):
            strides[i] *= stride_factor

    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(
        box.start_coord, box.end_coord, strides, op_shape4D
    )

    for idx, offset in enumerate(tile_base_offsets):
        addresses[idx] += offset
    fm.tiles = NpuTileBox(
        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
    )
    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
    fm.name = tens.name
    return fm


def create_weights(
    weight_tensor: NpuWeightTensor, weight_box: Box, scale_tensor: NpuWeightTensor, arch: ArchitectureFeatures
) -> Tuple[List[NpuAddressRange], List[NpuAddressRange]]:
    """Returns address ranges for weights and scales"""
    weights = []
    biases = []
    shared_region = get_region(weight_tensor.mem_type, arch)
    scale_region = get_region(scale_tensor.mem_type, arch) if scale_tensor else 0

    w_tensor_src = weight_tensor
    if weight_tensor.src_tensor:
        w_tensor_src = cast(NpuWeightTensor, weight_tensor.src_tensor)

    core_offset = 0
    for core in range(0, arch.ncores):
        # Get weight range per core
        key = WeightKey(core, weight_box.start_coord[-1])
        if key in w_tensor_src.encoded_ranges:
            weight_range = w_tensor_src.encoded_ranges[key]
            if weight_tensor == w_tensor_src:
                # Straight from source tensor
                address = weight_tensor.address + weight_range.offset
            else:
                # Weight buffered tensor
                address = weight_tensor.address + core_offset
                core_offset += round_up(weight_range.total_bytes, 16)

            # Location of weights in tensor
            addr_range = NpuAddressRange(
                shared_region, int(address + weight_range.weight_offset), round_up(int(weight_range.weight_bytes), 16)
            )
            weights.append(addr_range)

            # Location of standalone scales or combined weights tensor scales
            if scale_tensor:
                assert scale_tensor.src_tensor is None  # Must be standalone
                scale_range = scale_tensor.encoded_ranges[key]
                address = scale_tensor.address + scale_range.offset
                addr_range = NpuAddressRange(scale_region, int(address), round_up(int(scale_range.scale_bytes), 16))
            else:
                addr_range = NpuAddressRange(shared_region, int(address), round_up(int(weight_range.scale_bytes), 16))

            biases.append(addr_range)

    return weights, biases
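# With a multi-core accelerator (arch.ncores > 1) the encoded weight stream holds one range per
# core, so one NpuAddressRange per core is emitted for both weights and scales; on a single-core
# configuration the loop above runs exactly once.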


def create_npu_activation(op: Operation) -> NpuActivation:
    """Creates fused activation function"""
    if op.activation is None:
        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
    faf = op.activation.op_type
    act_op = NpuActivationOp.NONE_OR_RELU
    if faf == Op.Tanh:
        act_op = NpuActivationOp.TANH
    elif faf == Op.Sigmoid:
        act_op = NpuActivationOp.SIGMOID
    elif faf == Op.LUT:
        act_op = NpuActivationOp.TABLE_LOOKUP
    elif not faf.is_relu_op():
        raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")

    act = NpuActivation(act_op)
    act.min = op.activation.min
    act.max = op.activation.max
    if act_op is NpuActivationOp.NONE_OR_RELU and op.type.is_avgpool_op() and not op.explicit_scaling:
        quant = op.ofm.quantization
        if quant and quant.zero_point:  # Zero point is not 0
            scale_f32 = 1 if quant.scale_f32 is None else quant.scale_f32
            zero_point = quant.zero_point
            if act.min is not None:
                act.min = scale_f32 * quantise_float32(act.min, scale_f32, zero_point)
            if act.max is not None:
                act.max = scale_f32 * quantise_float32(act.max, scale_f32, zero_point)
    act.lookup_table_index = op.activation.lut_index
    return act


def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
    """Sets common fields of the given operation"""
    ps = cmd.ps
    op = ps.primary_op

    ifm_height = cmd.ifm_box.get_block().height
    ifm_width = cmd.ifm_box.get_block().width
    ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)

    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0], op.tile_base_offsets_ifm[0])
    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)

    out_block = cmd.ofm_box.get_block()
    npu_op.ofm = create_feature_map(
        cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0], op.tile_base_offsets_ofm, op.ofm_stride_multiplier
    )
    npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)

    if cmd.weight_tensor is not None:
        npu_op.weights, npu_op.biases = create_weights(cmd.weight_tensor, cmd.weight_box, cmd.scale_tensor, arch)
    npu_op.activation = create_npu_activation(op)
    npu_op.fused_quantize = any(op.type == Op.Quantize or op.original_type == Op.Quantize for op in ps.ops)
    npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])

    if not op.type.is_elementwise_op():
        npu_op.padding = create_padding(cmd, op, npu_op)
        npu_op.kernel = to_npu_kernel(op.kernel)
    npu_op.ifm_upscale = resampling_mode_inv_map[op.ifm_resampling_mode]
    return npu_op


def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
    """Converts the command to NpuConv2DOperation"""
    npu_op = NpuConv2DOperation()
    set_common_op_fields(npu_op, cmd, arch)
    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    else:
        if cmd.weight_tensor.src_tensor:
            npu_op.block_traversal = cmd.weight_tensor.src_tensor.hw_traversal
        else:
            npu_op.block_traversal = cmd.weight_tensor.hw_traversal
    return npu_op


def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
    """Converts the command to NpuConvDepthWiseOperation"""
    npu_op = NpuConvDepthWiseOperation()
    set_common_op_fields(npu_op, cmd, arch)
    return npu_op


def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
    """Converts the command to NpuPoolingOperation"""
    ps = cmd.ps
    op = ps.primary_op
    if op.type.is_maxpool_op():
        pool_op = NpuPoolingOp.MAX
    elif op.type.is_avgpool_op() or op.type.is_resize_op():
        pool_op = NpuPoolingOp.AVERAGE
    elif op.type == Op.ReduceSum:
        pool_op = NpuPoolingOp.REDUCE_SUM
    else:
        assert 0, f"Unknown pool type {op.type}"
    npu_op = NpuPoolingOperation(pool_op)
    set_common_op_fields(npu_op, cmd, arch)
    # Pooling specific info
    if op.explicit_scaling:
        # Note: the rescale field is reused for explicit scaling to avoid exposing this in the external API
        npu_op.rescale = op.explicit_scaling
    return npu_op


def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
    """Converts the command to NpuElementWiseOperation"""
    ps = cmd.ps
    op = ps.primary_op
    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
    elemwise_op = elementwise_op_map[op.type]
    npu_op = NpuElementWiseOperation(elemwise_op)

    if elemwise_op not in UNARY_ELEMWISE_OPS:
        ifm_shape = None if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0]
        ifm2_shape = None if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1]
        if cmd.reversed_operands:
            assert ifm_ifm2_correct_order(ifm_shape, ifm2_shape)
            npu_op.reversed_operands = True
        elif not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
            # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
            ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
            npu_op.reversed_operands = True
        npu_op.ifm2 = create_feature_map(
            cmd.ifm2_tensor,
            cmd.ifm2_box,
            arch,
            ps.ifm_shapes[1],
            op.tile_base_offsets_ifm[1],
        )
        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
        if cmd.ifm2_tensor.shape == []:
            # scalar
            npu_op.ifm2_scalar = cmd.ifm2_tensor.get_scalar()
            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
        else:
            ifm2_blk = cmd.ifm2_box.get_block()
            npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_blk.width, depth=ifm2_blk.depth)
    set_common_op_fields(npu_op, cmd, arch)
    # Check if the output scale needs to be overridden
    output_scale = None
    if op.explicit_scaling is not None:
        assert not op.explicit_scaling.per_channel
        assert op.type in (Op.Add, Op.Mul, Op.Sub)
        npu_op.rescale = (op.explicit_scaling.multiplier[0], op.explicit_scaling.shift[0])
    elif op.type == Op.Add and op.original_type.is_resize_op():
        # Force the output scale to be the same as the input scale for
        # resizebilinear/nearestneighbor 1x1 that is converted to add
        output_scale = npu_op.ifm2.quantization.scale_f32
    elif op.type == Op.Abs:
        output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
    elif op.type == Op.LeakyRelu:
        output_scale = op.attrs["alpha"]
    elif op.type in (Op.Add, Op.Mul, Op.Sub):
        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
            output_scale = 1 / 0x3000
    if output_scale is not None:
        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
    return npu_op


def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
    """Converts the command to NpuDmaOperation"""
    src_region = get_region(cmd.in_tensor.mem_type, arch)
    if cmd.out_tensor.purpose == TensorPurpose.LUT:
        dest_region = BASE_PTR_INDEX_MEM2MEM
    else:
        dest_region = get_region(cmd.out_tensor.mem_type, arch)

    if cmd.in_tensor.purpose == TensorPurpose.Weights:
        # Get weight range per core
        sz = 0
        for core in range(0, arch.ncores):
            key = WeightKey(core, cmd.box.start_coord[-1])
            if key in cmd.in_tensor.encoded_ranges:
                weight_range = cmd.in_tensor.encoded_ranges[key]
                sz += round_up(weight_range.total_bytes, 16)

                if core == 0:
                    weight_range = cmd.in_tensor.encoded_ranges[key]
                    src_addr = cmd.in_tensor.address + weight_range.offset
                    dest_addr = cmd.out_tensor.address
    else:
        src_addr = cmd.in_tensor.address_for_coordinate(cmd.box.start_coord)
        dest_addr = cmd.out_tensor.address_for_coordinate(cmd.box.start_coord)
        # DMA must use 16-byte alignment (tensors are always aligned but the sz calculation uses actual size)
        sz = round_up(cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr, 16)
    src = NpuAddressRange(src_region, int(src_addr), int(sz))
    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
    return NpuDmaOperation(src, dest)
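# Note: for LUT transfers the destination region is the internal MEM2MEM base pointer rather than
# a memory-type region, and for weight tensors the transfer length is the sum of the per-core
# encoded ranges, each rounded up to 16 bytes.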


def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
    """Converts the high level command to NpuOperation"""
    npu_op: NpuOperation
    if isinstance(cmd, DMA):
        npu_op = create_dma_op(cmd, arch)
        npu_op.name = cmd.out_tensor.name
    elif isinstance(cmd, NpuStripe):
        npu_block_type = cmd.ps.primary_op.type.npu_block_type
        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
            npu_op = create_npu_conv2d_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            npu_op = create_npu_conv_depthwise_op(cmd, arch)
        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            npu_op = create_npu_pool_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ElementWise:
            npu_op = create_npu_elementwise_op(cmd, arch)
        else:
            assert 0, f"Unknown command type {npu_block_type}"
        npu_op.name = cmd.ps.primary_op.name
    return npu_op


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        elif isinstance(cmd, NOP):
            # NOP should not generate anything
            continue
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    mem_limits = get_mem_limits_for_regions(arch)
    # Generate register commands
    if len(sg.high_level_command_stream) > 0:
        stream_id = DebugDatabase.add_stream(sg)
        sg.generated_stream_id = stream_id

        def add_to_debug_db(npu_op: NpuOperation, offset: int):
            """Adds info to the debug database"""
            if not isinstance(npu_op, NpuDmaOperation):
                cmd = npu_op_to_cmd[npu_op]
                DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

        sg.register_command_stream = generate_command_stream(
            npu_op_list, arch, verbose, mem_limits, add_to_debug_db, npu_op_to_cmd
        )