Blame - ethosu/vela/high_level_command_to_npu_op.py - ml/ethos-u/ethos-u-vela

blob: 56c5e74741bcd810cf14e224e1446e81b87f6f95 [file] [log] [blame]

erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Conversion from high level command to NpuOperation
				19	from enum import IntEnum
				20	from typing import List
				21	from typing import Optional
				22
				23	from .api import NpuActivation
				24	from .api import NpuActivationOp
				25	from .api import NpuAddressRange
				26	from .api import NpuBlockOperation
				27	from .api import NpuBlockTraversal
				28	from .api import NpuConv2DOperation
				29	from .api import NpuConvDepthWiseOperation
				30	from .api import NpuDataType
				31	from .api import NpuDmaOperation
				32	from .api import NpuElementWiseOp
				33	from .api import NpuElementWiseOperation
				34	from .api import NpuFeatureMap
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	35	from .api import NpuLayout
				36	from .api import NpuOperation
				37	from .api import NpuPadding
				38	from .api import NpuPoolingOp
				39	from .api import NpuPoolingOperation
				40	from .api import NpuQuantization
				41	from .api import NpuResamplingMode
				42	from .api import NpuRoundingMode
				43	from .api import NpuShape3D
				44	from .api import NpuTileBox
				45	from .architecture_features import ArchitectureFeatures
				46	from .data_type import DataType
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	47	from .debug_database import DebugDatabase
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	48	from .errors import UnsupportedFeatureError
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	49	from .high_level_command_stream import Box
				50	from .high_level_command_stream import Command
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	51	from .high_level_command_stream import DMA
				52	from .high_level_command_stream import NpuStripe
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	53	from .operation import NpuBlockType
				54	from .operation import Op
				55	from .operation import Operation
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	56	from .register_command_stream_generator import generate_command_stream
				57	from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	58	from .register_command_stream_util import to_npu_kernel
				59	from .register_command_stream_util import UNARY_ELEMWISE_OPS
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame]	60	from .shape4d import Shape4D
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	61	from .tensor import MemType
				62	from .tensor import Tensor
				63	from .tensor import TensorBlockTraversal
				64	from .tensor import TensorFormat
				65	from .tensor import TensorPurpose
				66
				67
class BasePointerIndex(IntEnum):
    """Indices of the base pointers (regions) that tensor addresses are relative to."""

    # Base address index for the Weight tensor
    WeightTensor = 0
    # Base address index for the Scratch_tensor in the TensorArena
    ScratchTensor = 1
    # Base address for the Scratch_fast_tensor
    ScratchFastTensor = 2
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	72
				73
				74	dtype_map = {
				75	DataType.uint8: NpuDataType.UINT8,
				76	DataType.int8: NpuDataType.INT8,
				77	DataType.uint16: NpuDataType.UINT16,
				78	DataType.int16: NpuDataType.INT16,
				79	DataType.int32: NpuDataType.INT32,
				80	}
				81
				82
				83	block_traversal_map = {
				84	TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
				85	TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
				86	}
				87
				88
				89	# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
				90	elementwise_op_map = {
				91	Op.Mul: NpuElementWiseOp.MUL,
				92	Op.Add: NpuElementWiseOp.ADD,
Fredrik Svedberg	e82be7c	2021-01-18 15:21:03 +0100	[diff] [blame]	93	Op.RescaleAdd: NpuElementWiseOp.ADD,
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	94	Op.Sub: NpuElementWiseOp.SUB,
				95	Op.Minimum: NpuElementWiseOp.MIN,
				96	Op.Maximum: NpuElementWiseOp.MAX,
				97	Op.LeakyRelu: NpuElementWiseOp.LRELU,
				98	Op.Abs: NpuElementWiseOp.ABS,
				99	Op.CLZ: NpuElementWiseOp.CLZ,
				100	Op.SHR: NpuElementWiseOp.SHR,
				101	Op.SHL: NpuElementWiseOp.SHL,
				102	}
				103
				104
def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
    """Checks whether the two elementwise inputs are already in the order the NPU needs.

    A scalar (shape []) or broadcasted feature map has to be the IFM2 operand.
    Returns False when the inputs need to be swapped, True otherwise.
    """
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape == []:
        return True

    # A dimension of size 1 in IFM that differs from IFM2 means IFM is the broadcasted
    # feature map, which needs to be in IFM2
    ifm_is_broadcasted = any(
        ifm_dim != ifm2_dim and ifm_dim == 1 for ifm_dim, ifm2_dim in zip(ifm_shape, ifm2_shape)
    )
    return not ifm_is_broadcasted
				117
				118
def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used for the operation.

    The mode is first derived from the operator type; an explicit op.rounding_mode,
    when set, takes precedence over the derived mode.
    """
    if op.type == Op.ResizeBilinear:
        derived_mode = NpuRoundingMode.TRUNCATE
    elif (
        op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        # Convolutions with int16 IFM
        derived_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op.type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        # Average pool with a single-element kernel that writes into a concat slice, without fused quantize
        derived_mode = NpuRoundingMode.NATURAL
    else:
        derived_mode = NpuRoundingMode.TFL

    return derived_mode if op.rounding_mode is None else op.rounding_mode
				139
				140
				141	def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
				142	if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				143	return NpuPadding(top=0, left=0, bottom=0, right=0)
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	144	top, left, bottom, right = primary_op.attrs["explicit_padding"]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	145
				146	# Check if this is for horizontal ifm streaming
				147	if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	148	top = cmd.pad_top
				149	bottom = cmd.pad_bottom
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	150
				151	# Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
				152	# because of activation function needed to be fused.
Andreas Nevalainen	083f103	2020-11-18 10:45:50 +0100	[diff] [blame]	153	if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > 0:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	154	left = 0
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	155	if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < cmd.ps.ifm_shapes[0].width:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	156	right = 0
				157	return NpuPadding(top=top, left=left, bottom=bottom, right=right)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	158
				159
				160	def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	161	base_ptr_idx_map = {
				162	MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
				163	MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
				164	MemType.Scratch: BasePointerIndex.ScratchTensor,
				165	}
				166
				167	if arch.is_spilling_enabled():
				168	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	169	else:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	170	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor
				171
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	172	return base_ptr_idx_map[tens.mem_type].value
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	173
				174
				175	def get_upscale(op: Operation) -> NpuResamplingMode:
				176	upscale = NpuResamplingMode.NONE
				177	if op.type == Op.ResizeBilinear:
				178	# perform nearest neighbor upscale
				179	upscale = NpuResamplingMode.NEAREST
				180	elif op.type == Op.Conv2DBackpropInputSwitchedBias:
				181	# perform insert zero upscale
				182	upscale = NpuResamplingMode.TRANSPOSE
				183	return upscale
				184
				185
				186	def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
				187	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	188	block = ifm_box.get_block()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	189	else:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	190	block = ofm_box.get_block()
				191	return block.depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	192
				193
				194	def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
				195	"""Checks if quantization should use 0 as zero point"""
				196	if tens.dtype == DataType.int32 and is_ifm_tensor:
				197	return True
				198	if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
				199	return False
				200	fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
				201	forced_ofm_quantization = ps.primary_op.forced_output_quantization
				202	use_0 = (
				203	(ps.primary_op.activation is None or forced_ofm_quantization is not None)
				204	and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
				205	and not fused_quantize
				206	)
				207	return use_0
				208
				209
				210	def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				211	"""Gets quantization for IFM/IFM2"""
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame^]	212	op = ps.primary_op
				213	ifm_quant = op.forced_input_quantization if op.forced_input_quantization is not None else tens.quantization
				214	if ifm_quant is None:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	215	return None
				216	if use_zero_point_0(ps, tens, True):
				217	zero_point = 0
				218	else:
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame^]	219	zero_point = int(ifm_quant.zero_point)
				220	return NpuQuantization(scale_f32=ifm_quant.scale_f32, zero_point=zero_point)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	221
				222
				223	def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				224	"""Gets quantization for OFM"""
				225	op = ps.primary_op
				226	# Check if operation's output quantization is should be used instead of the output tensor's quantization
				227	# (used in LUTs)
				228	ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
				229	if ofm_quant is None:
				230	return None
				231	if use_zero_point_0(ps, tens, False):
				232	zero_point = 0
				233	else:
				234	zero_point = int(ofm_quant.zero_point)
				235	return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
				236
				237
def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, op_shape4D: Shape4D) -> NpuFeatureMap:
    """Creates feature map with common fields populated.

    Sets region, data type, layout, tiles and strides. The shape and quantization
    are not set here.
    """
    feature_map = NpuFeatureMap()
    feature_map.region = get_region(tens, arch)
    feature_map.data_type = dtype_map[tens.dtype]

    tensor_format = tens.format
    if tensor_format == TensorFormat.NHWC:
        feature_map.layout = NpuLayout.NHWC
    elif tensor_format == TensorFormat.NHCWB16:
        feature_map.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"

    height_0, height_1, width_0, tile_addresses = tens.addresses_for_rolling_buffer(
        box.start_coord, box.end_coord, op_shape4D
    )
    # Entries without an address are replaced by address 0 (in place)
    for tile_idx, tile_addr in enumerate(tile_addresses):
        if tile_addr is None:
            tile_addresses[tile_idx] = 0
    feature_map.tiles = NpuTileBox(
        height_0=height_0,
        height_1=height_1,
        width_0=width_0,
        addresses=[int(tile_addr) for tile_addr in tile_addresses],
    )

    tensor_strides = tens.get_strides(shape4D=op_shape4D)
    # Elements 2, 3 and 1 of the tensor strides supply the height, width and depth stride respectively
    feature_map.strides = NpuShape3D(
        height=int(tensor_strides[2]), width=int(tensor_strides[3]), depth=int(tensor_strides[1])
    )
    return feature_map
				261
				262
				263	def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
				264	"""Returns address ranges for weights"""
				265	weights = []
				266	stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
				267	weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
				268	substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length
				269
				270	# Extract weight substream offsets and calculate their lengths
				271	assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
				272	weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
				273	region = get_region(weight_tensor, arch)
				274	for core in range(substreams):
				275	address = weight_addr + weight_substream_offsets[core]
				276	length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
				277	addr_range = NpuAddressRange(region, int(address), int(length))
				278	weights.append(addr_range)
				279	return weights
				280
				281
				282	def create_biases(
				283	weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
				284	) -> List[NpuAddressRange]:
				285	"""Returns address ranges for biases"""
				286	biases = []
				287	stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
				288	scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
				289	substreams = len(scale_substream_offsets) - 1 # Offset list must terminate with full stream length
				290
				291	# Extract scale substream offsets and calculate their lengths
				292	assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
				293	scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
				294
				295	region = get_region(scale_tensor, arch)
				296	for core in range(substreams):
				297	address = scale_addr + scale_substream_offsets[core]
				298	length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
				299	addr_range = NpuAddressRange(region, int(address), int(length))
				300	biases.append(addr_range)
				301	return biases
				302
				303
				304	def create_npu_activation(op: Operation) -> NpuActivation:
				305	"""Creates fused activation function"""
				306	if op.activation is None:
				307	return NpuActivation(NpuActivationOp.NONE_OR_RELU)
				308	faf = op.activation.op_type
				309	act_op = NpuActivationOp.NONE_OR_RELU
				310	if faf == Op.Tanh:
				311	act_op = NpuActivationOp.TANH
				312	elif faf == Op.Sigmoid:
				313	act_op = NpuActivationOp.SIGMOID
				314	elif faf == Op.LUT:
				315	act_op = NpuActivationOp.TABLE_LOOKUP
				316	elif not faf.is_relu_op():
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	317	raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	318
				319	act = NpuActivation(act_op)
				320	act.min = op.activation.min
				321	act.max = op.activation.max
				322	act.lookup_table_index = op.activation.lut_index
				323	return act
				324
				325
				326	def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
				327	"""Sets common fields of the given operation"""
				328	ps = cmd.ps
				329	op = ps.primary_op
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	330
				331	ifm_height = cmd.ifm_box.get_block().height
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	332	ifm_width = cmd.ps.ifm_shapes[0].width
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	333	ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	334
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	335	npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0])
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	336	npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	337	npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	338
				339	out_block = cmd.ofm_box.get_block()
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	340	npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	341	npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	342	npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
				343
				344	if cmd.weight_tensor is not None:
				345	npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
				346	if cmd.scale_tensor is not None:
				347	npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
				348	npu_op.activation = create_npu_activation(op)
Patrik Gustavsson	b0ca274	2020-11-18 07:59:09 +0100	[diff] [blame]	349	npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
				350	npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	351	npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])
				352
				353	if not op.type.is_elementwise_op():
				354	npu_op.padding = create_padding(cmd, op)
				355	npu_op.kernel = to_npu_kernel(op.kernel)
				356	npu_op.ifm_upscale = get_upscale(op)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	357	return npu_op
				358
				359
				360	def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
				361	"""Converts the command to NpuConv2DOperation"""
				362	npu_op = NpuConv2DOperation()
				363	set_common_op_fields(npu_op, cmd, arch)
				364	if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				365	npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
				366	else:
				367	npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
				368	return npu_op
				369
				370
				371	def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
				372	"""Converts the command to NpuConvDepthWiseOperation"""
				373	npu_op = NpuConvDepthWiseOperation()
				374	set_common_op_fields(npu_op, cmd, arch)
				375	return npu_op
				376
				377
				378	def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
				379	"""Converts the command to NpuPoolingOperation"""
				380	ps = cmd.ps
				381	op = ps.primary_op
				382	pool_op = NpuPoolingOp.AVERAGE
				383	if op.type.is_maxpool_op():
				384	pool_op = NpuPoolingOp.MAX
				385	elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
				386	pool_op = NpuPoolingOp.AVERAGE
				387	elif op.type == Op.ReduceSum:
				388	pool_op = NpuPoolingOp.REDUCE_SUM
				389	else:
				390	assert 0, f"Unknown pool type {op.type}"
				391	npu_op = NpuPoolingOperation(pool_op)
				392	set_common_op_fields(npu_op, cmd, arch)
				393	# Pooling specific info
Dwight Lidman	4f728c0	2020-12-17 15:14:45 +0100	[diff] [blame^]	394	npu_op.rescale = op.rescale
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	395	return npu_op
				396
				397
				398	def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
				399	"""Converts the command to NpuElementWiseOperation"""
				400	ps = cmd.ps
				401	op = ps.primary_op
				402	assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
				403	elemwise_op = elementwise_op_map[op.type]
				404	npu_op = NpuElementWiseOperation(elemwise_op)
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	405
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	406	if elemwise_op not in UNARY_ELEMWISE_OPS:
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	407	ifm_shape = [] if cmd.ifm_tensor.shape == [] else ps.ifm_shapes[0].as_list()
				408	ifm2_shape = [] if cmd.ifm2_tensor.shape == [] else ps.ifm_shapes[1].as_list()
				409	if not ifm_ifm2_correct_order(ifm_shape, ifm2_shape):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	410	# The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
				411	cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
				412	cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	413	ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	414	npu_op.reversed_operands = True
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	415	npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch, ps.ifm_shapes[1])
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	416	npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
				417	if cmd.ifm2_tensor.shape == []:
				418	# scalar
				419	assert cmd.ifm2_tensor.quant_values.size == 1
				420	npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
				421	npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
				422	else:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	423	ifm2_blk = cmd.ifm2_box.get_block()
Patrik Gustavsson	3a26920	2021-01-21 08:28:55 +0100	[diff] [blame]	424	ifm2_width = ps.ifm_shapes[1].width
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	425	npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_width, depth=ifm2_blk.depth)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	426	set_common_op_fields(npu_op, cmd, arch)
				427	# Check if output scale needs to be overridden
				428	output_scale = None
				429	if op.type == Op.Add and "resizebilinear" in op.attrs:
				430	# Force output scale same as the input scale for
				431	# resizebilinear 1x1 that is converted to add
				432	output_scale = npu_op.ifm2.quantization.scale_f32
Fredrik Svedberg	f2afd7f	2021-02-01 21:42:12 +0100	[diff] [blame]	433	if op.type == Op.Abs:
				434	output_scale = npu_op.ifm.quantization.scale_f32 / npu_op.ofm.quantization.scale_f32
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	435	if op.type == Op.LeakyRelu:
				436	output_scale = op.attrs["alpha"]
Fredrik Svedberg	e82be7c	2021-01-18 15:21:03 +0100	[diff] [blame]	437	if op.type == Op.RescaleAdd:
				438	assert op.rescale is not None, f"{op.type} must have rescale"
				439	npu_op.rescale = op.rescale
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	440	if op.type in (Op.Add, Op.Mul, Op.Sub):
				441	if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
				442	output_scale = 1 / 0x3000
				443	if output_scale is not None:
				444	npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
				445	return npu_op
				446
				447
				448	def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
				449	"""Converts the command to NpuDmaOperation"""
				450	src_region = get_region(cmd.in_tensor, arch)
				451	if cmd.out_tensor.purpose == TensorPurpose.LUT:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	452	dest_region = BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	453	else:
				454	dest_region = get_region(cmd.out_tensor, arch)
				455
				456	start_coord = cmd.box.start_coord
				457	src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
				458	dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
				459
				460	if cmd.in_tensor.compressed_values is not None:
				461	if cmd.out_tensor.purpose == TensorPurpose.FSBias:
				462	sz = cmd.in_tensor.storage_size()
				463	else:
				464	stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
				465	sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
				466	else:
				467	sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
				468	src = NpuAddressRange(src_region, int(src_addr), int(sz))
				469	dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
				470	return NpuDmaOperation(src, dest)
				471
				472
				473	def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
				474	"""Converts the high level command to NpuOperation"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	475	npu_op: NpuOperation
				476	if isinstance(cmd, DMA):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	477	npu_op = create_dma_op(cmd, arch)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	478	elif isinstance(cmd, NpuStripe):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	479	npu_block_type = cmd.ps.primary_op.type.npu_block_type
				480	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
				481	npu_op = create_npu_conv2d_op(cmd, arch)
				482	elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
				483	npu_op = create_npu_conv_depthwise_op(cmd, arch)
				484	elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
				485	npu_op = create_npu_pool_op(cmd, arch)
				486	elif npu_block_type == NpuBlockType.ElementWise:
				487	npu_op = create_npu_elementwise_op(cmd, arch)
				488	else:
				489	assert 0, f"Unknown command type {npu_block_type}"
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	490	return npu_op
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	491
				492
				493	def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
				494	"""Generates command stream for the subgraph, adds it to sg.register_command_stream"""
				495	# Convert high level command stream to list of NpuOperation
				496	npu_op_list = []
				497	npu_op_to_cmd = dict() # map from npu op to high level command
				498	for cmd in sg.high_level_command_stream:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	499	if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	500	print("Warning: Skipping register command stream generation for", cmd.ps)
				501	else:
				502	npu_op = convert_command_to_npu_op(cmd, arch)
				503	npu_op_list.append(npu_op)
				504	npu_op_to_cmd[npu_op] = cmd
				505	# Generate register commands
erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	506	if len(sg.high_level_command_stream) > 0:
				507	stream_id = DebugDatabase.add_stream(sg)
				508	sg.generated_stream_id = stream_id
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	509
erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	510	def add_to_debug_db(npu_op: NpuOperation, offset: int):
				511	"""Adds info to the debug database"""
				512	if not isinstance(npu_op, NpuDmaOperation):
				513	cmd = npu_op_to_cmd[npu_op]
				514	DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	515
erik.andersson@arm.com	ad45f79	2021-02-03 10:20:16 +0100	[diff] [blame]	516	sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)