Blame - ethosu/vela/high_level_command_to_npu_op.py - ml/ethos-u/ethos-u-vela

blob: 071170258a9c78e42d8d9ecb1645ca5a2d39a820 [file] [log] [blame]

Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Conversion from high level command to NpuOperation
				19	from enum import IntEnum
				20	from typing import List
				21	from typing import Optional
				22
				23	from .api import NpuActivation
				24	from .api import NpuActivationOp
				25	from .api import NpuAddressRange
				26	from .api import NpuBlockOperation
				27	from .api import NpuBlockTraversal
				28	from .api import NpuConv2DOperation
				29	from .api import NpuConvDepthWiseOperation
				30	from .api import NpuDataType
				31	from .api import NpuDmaOperation
				32	from .api import NpuElementWiseOp
				33	from .api import NpuElementWiseOperation
				34	from .api import NpuFeatureMap
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	35	from .api import NpuLayout
				36	from .api import NpuOperation
				37	from .api import NpuPadding
				38	from .api import NpuPoolingOp
				39	from .api import NpuPoolingOperation
				40	from .api import NpuQuantization
				41	from .api import NpuResamplingMode
				42	from .api import NpuRoundingMode
				43	from .api import NpuShape3D
				44	from .api import NpuTileBox
				45	from .architecture_features import ArchitectureFeatures
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	46	from .architecture_features import Block
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	47	from .data_type import DataType
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	48	from .debug_database import DebugDatabase
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	49	from .errors import UnsupportedFeatureError
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	50	from .high_level_command_stream import Box
				51	from .high_level_command_stream import Command
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	52	from .high_level_command_stream import DMA
				53	from .high_level_command_stream import NpuStripe
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	54	from .operation import NpuBlockType
				55	from .operation import Op
				56	from .operation import Operation
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	57	from .register_command_stream_generator import generate_command_stream
				58	from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	59	from .register_command_stream_util import to_npu_kernel
				60	from .register_command_stream_util import UNARY_ELEMWISE_OPS
Patrik Gustavsson	bf31d64	2020-12-16 13:08:06 +0100	[diff] [blame^]	61	from .shape4d import Shape4D
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	62	from .tensor import MemType
				63	from .tensor import Tensor
				64	from .tensor import TensorBlockTraversal
				65	from .tensor import TensorFormat
				66	from .tensor import TensorPurpose
				67
				68
class BasePointerIndex(IntEnum):
    """Base pointer (region) indices used when addressing tensors.

    get_region() maps a tensor's memory type onto one of these members and
    returns the member's integer value.
    """

    # base address index for the Weight tensor
    WeightTensor = 0
    # base address index for the Scratch_tensor in the TensorArena
    ScratchTensor = 1
    # base address for the Scratch_fast_tensor
    ScratchFastTensor = 2
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	73
				74
# Maps the compiler's internal DataType to the NpuDataType of the external API.
# Only the five integer types listed here are convertible; looking up any other
# DataType raises KeyError (see create_feature_map).
dtype_map = {
    DataType.uint8: NpuDataType.UINT8,
    DataType.int8: NpuDataType.INT8,
    DataType.uint16: NpuDataType.UINT16,
    DataType.int16: NpuDataType.INT16,
    DataType.int32: NpuDataType.INT32,
}
				82
				83
# Maps a weight tensor's TensorBlockTraversal to the NpuBlockTraversal of the external API.
# Used by create_npu_conv2d_op; a traversal that is not listed here raises KeyError on lookup.
block_traversal_map = {
    TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
    TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
}
				88
				89
# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE.
# create_npu_elementwise_op asserts that the primary op's type is a key of this map,
# so an op type must be added here before it can be converted to an NpuElementWiseOperation.
elementwise_op_map = {
    Op.Mul: NpuElementWiseOp.MUL,
    Op.Add: NpuElementWiseOp.ADD,
    Op.Sub: NpuElementWiseOp.SUB,
    Op.Minimum: NpuElementWiseOp.MIN,
    Op.Maximum: NpuElementWiseOp.MAX,
    Op.LeakyRelu: NpuElementWiseOp.LRELU,
    Op.Abs: NpuElementWiseOp.ABS,
    Op.CLZ: NpuElementWiseOp.CLZ,
    Op.SHR: NpuElementWiseOp.SHR,
    Op.SHL: NpuElementWiseOp.SHL,
}
				103
				104
def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
    """Checks whether IFM and IFM2 are already in the order required for a binary elementwise op

    A scalar (shape []) or broadcasted (dimension 1 against a different dimension)
    feature map has to be IFM2. Returns False when the caller must swap the operands,
    True otherwise. Dimensions are compared pairwise from the front; extra trailing
    dimensions of the longer shape are ignored.
    """
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape == []:
        return True

    # Broadcasted FM needs to be in IFM2
    ifm_is_broadcast = any(dim != dim2 and dim == 1 for dim, dim2 in zip(ifm_shape, ifm2_shape))
    return not ifm_is_broadcast
				117
				118
def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used

    The default is chosen from the operation's type, IFM data type and kernel;
    a "rounding_mode" entry in op.attrs always takes precedence over that default.
    """
    op_type = op.type
    if op_type == Op.ResizeBilinear:
        default_mode = NpuRoundingMode.TRUNCATE
    elif (
        op_type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        # 16-bit convolutions
        default_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op_type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        # 1x1 average pool that writes into a concat slice, without fused quantize
        default_mode = NpuRoundingMode.NATURAL
    else:
        default_mode = NpuRoundingMode.TFL
    return op.attrs.get("rounding_mode", default_mode)
				138
				139
				140	def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
				141	if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				142	return NpuPadding(top=0, left=0, bottom=0, right=0)
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	143	top, left, bottom, right = primary_op.attrs["explicit_padding"]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	144
				145	# Check if this is for horizontal ifm streaming
				146	if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	147	top = cmd.pad_top
				148	bottom = cmd.pad_bottom
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	149
				150	# Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
				151	# because of activation function needed to be fused.
Andreas Nevalainen	083f103	2020-11-18 10:45:50 +0100	[diff] [blame]	152	if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > 0:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	153	left = 0
Andreas Nevalainen	083f103	2020-11-18 10:45:50 +0100	[diff] [blame]	154	if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < Block.from_shape(cmd.ifm_tensor.shape).width:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	155	right = 0
				156	return NpuPadding(top=top, left=left, bottom=bottom, right=right)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	157
				158
				159	def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	160	base_ptr_idx_map = {
				161	MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
				162	MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
				163	MemType.Scratch: BasePointerIndex.ScratchTensor,
				164	}
				165
				166	if arch.is_spilling_enabled():
				167	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchFastTensor
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	168	else:
Tim Hall	1bd531d	2020-11-01 20:59:36 +0000	[diff] [blame]	169	base_ptr_idx_map[MemType.Scratch_fast] = BasePointerIndex.ScratchTensor
				170
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	171	return base_ptr_idx_map[tens.mem_type].value
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	172
				173
				174	def get_upscale(op: Operation) -> NpuResamplingMode:
				175	upscale = NpuResamplingMode.NONE
				176	if op.type == Op.ResizeBilinear:
				177	# perform nearest neighbor upscale
				178	upscale = NpuResamplingMode.NEAREST
				179	elif op.type == Op.Conv2DBackpropInputSwitchedBias:
				180	# perform insert zero upscale
				181	upscale = NpuResamplingMode.TRANSPOSE
				182	return upscale
				183
				184
				185	def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
				186	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	187	block = ifm_box.get_block()
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	188	else:
Louis Verhaard	69b3176	2020-11-17 09:45:20 +0100	[diff] [blame]	189	block = ofm_box.get_block()
				190	return block.depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	191
				192
				193	def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
				194	"""Checks if quantization should use 0 as zero point"""
				195	if tens.dtype == DataType.int32 and is_ifm_tensor:
				196	return True
				197	if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
				198	return False
				199	fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
				200	forced_ofm_quantization = ps.primary_op.forced_output_quantization
				201	use_0 = (
				202	(ps.primary_op.activation is None or forced_ofm_quantization is not None)
				203	and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
				204	and not fused_quantize
				205	)
				206	return use_0
				207
				208
				209	def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				210	"""Gets quantization for IFM/IFM2"""
				211	if tens.quantization is None:
				212	return None
				213	if use_zero_point_0(ps, tens, True):
				214	zero_point = 0
				215	else:
				216	zero_point = int(tens.quantization.zero_point)
				217	return NpuQuantization(scale_f32=tens.quantization.scale_f32, zero_point=zero_point)
				218
				219
				220	def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
				221	"""Gets quantization for OFM"""
				222	op = ps.primary_op
				223	# Check if operation's output quantization is should be used instead of the output tensor's quantization
				224	# (used in LUTs)
				225	ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
				226	if ofm_quant is None:
				227	return None
				228	if use_zero_point_0(ps, tens, False):
				229	zero_point = 0
				230	else:
				231	zero_point = int(ofm_quant.zero_point)
				232	return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
				233
				234
def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures, fm_shape: Shape4D) -> NpuFeatureMap:
    """Creates feature map with common fields populated

    Sets region, data type, layout, tiles and strides. Shape and quantization are
    not set here; the caller is expected to fill them in.
    """
    fm = NpuFeatureMap()
    fm.region = get_region(tens, arch)
    fm.data_type = dtype_map[tens.dtype]

    tens_format = tens.format
    if tens_format == TensorFormat.NHWC:
        fm.layout = NpuLayout.NHWC
    elif tens_format == TensorFormat.NHCWB16:
        fm.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"

    tile_info = tens.addresses_for_rolling_buffer(box.start_coord, box.end_coord, fm_shape)
    height_0, height_1, width_0, addresses = tile_info
    # Tiles without an address are reported as None; the tile box needs an integer, so use 0.
    # The list is updated in place.
    for idx, addr in enumerate(addresses):
        if addr is None:
            addresses[idx] = 0
    tile_addresses = [int(addr) for addr in addresses]
    fm.tiles = NpuTileBox(height_0=height_0, height_1=height_1, width_0=width_0, addresses=tile_addresses)

    # The stride list is indexed 2 -> height, 3 -> width, 1 -> depth
    strides = tens.get_strides()
    stride_h = int(strides[2])
    stride_w = int(strides[3])
    stride_c = int(strides[1])
    fm.strides = NpuShape3D(height=stride_h, width=stride_w, depth=stride_c)
    return fm
				256
				257
				258	def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
				259	"""Returns address ranges for weights"""
				260	weights = []
				261	stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
				262	weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
				263	substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length
				264
				265	# Extract weight substream offsets and calculate their lengths
				266	assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
				267	weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
				268	region = get_region(weight_tensor, arch)
				269	for core in range(substreams):
				270	address = weight_addr + weight_substream_offsets[core]
				271	length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
				272	addr_range = NpuAddressRange(region, int(address), int(length))
				273	weights.append(addr_range)
				274	return weights
				275
				276
				277	def create_biases(
				278	weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
				279	) -> List[NpuAddressRange]:
				280	"""Returns address ranges for biases"""
				281	biases = []
				282	stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
				283	scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
				284	substreams = len(scale_substream_offsets) - 1 # Offset list must terminate with full stream length
				285
				286	# Extract scale substream offsets and calculate their lengths
				287	assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
				288	scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
				289
				290	region = get_region(scale_tensor, arch)
				291	for core in range(substreams):
				292	address = scale_addr + scale_substream_offsets[core]
				293	length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
				294	addr_range = NpuAddressRange(region, int(address), int(length))
				295	biases.append(addr_range)
				296	return biases
				297
				298
				299	def create_npu_activation(op: Operation) -> NpuActivation:
				300	"""Creates fused activation function"""
				301	if op.activation is None:
				302	return NpuActivation(NpuActivationOp.NONE_OR_RELU)
				303	faf = op.activation.op_type
				304	act_op = NpuActivationOp.NONE_OR_RELU
				305	if faf == Op.Tanh:
				306	act_op = NpuActivationOp.TANH
				307	elif faf == Op.Sigmoid:
				308	act_op = NpuActivationOp.SIGMOID
				309	elif faf == Op.LUT:
				310	act_op = NpuActivationOp.TABLE_LOOKUP
				311	elif not faf.is_relu_op():
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	312	raise UnsupportedFeatureError(f"Unsupported fused_activation_function: {faf.name}")
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	313
				314	act = NpuActivation(act_op)
				315	act.min = op.activation.min
				316	act.max = op.activation.max
				317	act.lookup_table_index = op.activation.lut_index
				318	return act
				319
				320
def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuBlockOperation:
    """Sets common fields of the given operation

    Populates, from the stripe command and its pass: ifm and ofm (feature map, shape,
    quantization), weights and biases (when the command has them), activation,
    fused_quantize, rounding_mode, block_config, ifm_upscale and, for non-elementwise
    operations, padding and kernel.

    npu_op is modified in place and also returned.
    """
    ps = cmd.ps
    op = ps.primary_op

    # IFM height comes from the stripe's box, but the width is always the full tensor width
    ifm_height = cmd.ifm_box.get_block().height
    ifm_width = Block.from_shape(cmd.ifm_tensor.shape).width
    ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)

    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch, ps.ifm_shapes[0])
    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)

    # OFM shape is taken entirely from the stripe's OFM box
    out_block = cmd.ofm_box.get_block()
    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch, ps.ofm_shapes[0])
    npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)

    if cmd.weight_tensor is not None:
        npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
        # Biases are located via the weight tensor's stream index, so they require weights
        if cmd.scale_tensor is not None:
            npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
    npu_op.activation = create_npu_activation(op)
    # A Quantize op anywhere in the pass counts as fused quantize
    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
    npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
    # block_config entries 0, 1 and 3 are height, width and depth; entry 2 is not read here
    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])

    # Padding and kernel are only set for non-elementwise operations
    if not op.type.is_elementwise_op():
        npu_op.padding = create_padding(cmd, op)
        npu_op.kernel = to_npu_kernel(op.kernel)
    npu_op.ifm_upscale = get_upscale(op)
    return npu_op
				353
				354
				355	def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
				356	"""Converts the command to NpuConv2DOperation"""
				357	npu_op = NpuConv2DOperation()
				358	set_common_op_fields(npu_op, cmd, arch)
				359	if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
				360	npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
				361	else:
				362	npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
				363	return npu_op
				364
				365
				366	def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
				367	"""Converts the command to NpuConvDepthWiseOperation"""
				368	npu_op = NpuConvDepthWiseOperation()
				369	set_common_op_fields(npu_op, cmd, arch)
				370	return npu_op
				371
				372
				373	def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
				374	"""Converts the command to NpuPoolingOperation"""
				375	ps = cmd.ps
				376	op = ps.primary_op
				377	pool_op = NpuPoolingOp.AVERAGE
				378	if op.type.is_maxpool_op():
				379	pool_op = NpuPoolingOp.MAX
				380	elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
				381	pool_op = NpuPoolingOp.AVERAGE
				382	elif op.type == Op.ReduceSum:
				383	pool_op = NpuPoolingOp.REDUCE_SUM
				384	else:
				385	assert 0, f"Unknown pool type {op.type}"
				386	npu_op = NpuPoolingOperation(pool_op)
				387	set_common_op_fields(npu_op, cmd, arch)
				388	# Pooling specific info
				389	if op.type == Op.ResizeBilinear and "rescale" in op.attrs:
				390	npu_op.rescale = op.attrs["rescale"]
				391	return npu_op
				392
				393
def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
    """Converts the command to NpuElementWiseOperation

    For binary operations this may swap the IFM and IFM2 operands so that the
    scalar/broadcasted input ends up as IFM2. The swap is applied in place to cmd
    (ifm_tensor/ifm2_tensor, ifm_box/ifm2_box) and to ps.ifm_shapes, and is recorded
    in npu_op.reversed_operands.

    The OFM scale is overridden for resizebilinear-derived Add, for LeakyRelu and for
    Add/Mul/Sub with a fused Sigmoid/Tanh; when several apply, the last one assigned wins.
    """
    ps = cmd.ps
    op = ps.primary_op
    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
    elemwise_op = elementwise_op_map[op.type]
    npu_op = NpuElementWiseOperation(elemwise_op)

    # Only binary operations have a second input feature map
    if elemwise_op not in UNARY_ELEMWISE_OPS:
        if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
            # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
            ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]
            npu_op.reversed_operands = True
        npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch, ps.ifm_shapes[1])
        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
        if cmd.ifm2_tensor.shape == []:
            # scalar
            # NOTE(review): the size check reads quant_values but the scalar is taken from values
            assert cmd.ifm2_tensor.quant_values.size == 1
            npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
        else:
            # Height and depth come from the stripe's box, width is the full tensor width
            ifm2_blk = cmd.ifm2_box.get_block()
            ifm2_width = Block.from_shape(cmd.ifm2_tensor.shape).width
            npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_width, depth=ifm2_blk.depth)
    # Must run after the operand swap above, since it reads cmd.ifm_tensor/ifm_box and ps.ifm_shapes[0]
    set_common_op_fields(npu_op, cmd, arch)
    # Check if output scale needs to be overridden
    output_scale = None
    if op.type == Op.Add and "resizebilinear" in op.attrs:
        # Force output scale same as the input scale for
        # resizebilinear 1x1 that is converted to add
        output_scale = npu_op.ifm2.quantization.scale_f32
    if op.type == Op.LeakyRelu:
        # The "alpha" attribute is used as the output scale
        output_scale = op.attrs["alpha"]
    if op.type in (Op.Add, Op.Sub) and "rescale" in op.attrs:
        npu_op.rescale = op.attrs.get("rescale")
    if op.type in (Op.Add, Op.Mul, Op.Sub):
        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
            # Fixed output scale for fused Sigmoid/Tanh; overrides any scale chosen above
            output_scale = 1 / 0x3000
    if output_scale is not None:
        # Only the scale is replaced; the zero point computed by set_common_op_fields is kept
        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
    return npu_op
				437
				438
				439	def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
				440	"""Converts the command to NpuDmaOperation"""
				441	src_region = get_region(cmd.in_tensor, arch)
				442	if cmd.out_tensor.purpose == TensorPurpose.LUT:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	443	dest_region = BASE_PTR_INDEX_MEM2MEM
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	444	else:
				445	dest_region = get_region(cmd.out_tensor, arch)
				446
				447	start_coord = cmd.box.start_coord
				448	src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
				449	dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
				450
				451	if cmd.in_tensor.compressed_values is not None:
				452	if cmd.out_tensor.purpose == TensorPurpose.FSBias:
				453	sz = cmd.in_tensor.storage_size()
				454	else:
				455	stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
				456	sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
				457	else:
				458	sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
				459	src = NpuAddressRange(src_region, int(src_addr), int(sz))
				460	dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
				461	return NpuDmaOperation(src, dest)
				462
				463
				464	def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
				465	"""Converts the high level command to NpuOperation"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	466	npu_op: NpuOperation
				467	if isinstance(cmd, DMA):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	468	npu_op = create_dma_op(cmd, arch)
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	469	elif isinstance(cmd, NpuStripe):
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	470	npu_block_type = cmd.ps.primary_op.type.npu_block_type
				471	if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
				472	npu_op = create_npu_conv2d_op(cmd, arch)
				473	elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
				474	npu_op = create_npu_conv_depthwise_op(cmd, arch)
				475	elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
				476	npu_op = create_npu_pool_op(cmd, arch)
				477	elif npu_block_type == NpuBlockType.ElementWise:
				478	npu_op = create_npu_elementwise_op(cmd, arch)
				479	else:
				480	assert 0, f"Unknown command type {npu_block_type}"
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	481	return npu_op
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	482
				483
				484	def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
				485	"""Generates command stream for the subgraph, adds it to sg.register_command_stream"""
				486	# Convert high level command stream to list of NpuOperation
				487	npu_op_list = []
				488	npu_op_to_cmd = dict() # map from npu op to high level command
				489	for cmd in sg.high_level_command_stream:
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	490	if isinstance(cmd, NpuStripe) and cmd.ps.npu_block_type == NpuBlockType.Default:
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	491	print("Warning: Skipping register command stream generation for", cmd.ps)
				492	else:
				493	npu_op = convert_command_to_npu_op(cmd, arch)
				494	npu_op_list.append(npu_op)
				495	npu_op_to_cmd[npu_op] = cmd
				496	# Generate register commands
				497	stream_id = DebugDatabase.add_stream(sg)
				498	DebugDatabase.set_stream_offset(sg, 0) # Default to zero, can only set during file writing
				499
				500	def add_to_debug_db(npu_op: NpuOperation, offset: int):
				501	"""Adds info to the debug database"""
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	502	if not isinstance(npu_op, NpuDmaOperation):
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	503	cmd = npu_op_to_cmd[npu_op]
				504	DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)
				505
Dwight Lidman	9b43f84	2020-12-08 17:56:44 +0100	[diff] [blame]	506	sg.register_command_stream = generate_command_stream(npu_op_list, arch, verbose, add_to_debug_db, npu_op_to_cmd)