# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Conversion from high level command to NpuOperation
from enum import IntEnum
from typing import List
from typing import Optional

from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .data_type import DataType
from .high_level_command_stream import Box
from .high_level_command_stream import Command
from .high_level_command_stream import CommandType
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .operation import Kernel
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose


unary_elementwise_ops = set((NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ,))


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer
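    # Note (illustrative only): Mem2Mem packs a flag in bit 8 on top of base pointer index 3;
    # within this module it is only used as the destination region for LUT DMA transfers
    # (see create_dma_op below)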


dtype_map = {
    DataType.uint8: NpuDataType.UINT8,
    DataType.int8: NpuDataType.INT8,
    DataType.uint16: NpuDataType.UINT16,
    DataType.int16: NpuDataType.INT16,
    DataType.int32: NpuDataType.INT32,
}


block_traversal_map = {
    TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
    TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
}


# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
elementwise_op_map = {
    Op.Mul: NpuElementWiseOp.MUL,
    Op.Add: NpuElementWiseOp.ADD,
    Op.Sub: NpuElementWiseOp.SUB,
    Op.Minimum: NpuElementWiseOp.MIN,
    Op.Maximum: NpuElementWiseOp.MAX,
    Op.LeakyRelu: NpuElementWiseOp.LRELU,
    Op.Abs: NpuElementWiseOp.ABS,
    Op.CLZ: NpuElementWiseOp.CLZ,
    Op.SHR: NpuElementWiseOp.SHR,
    Op.SHL: NpuElementWiseOp.SHL,
}


def to_npu_kernel(kernel: Kernel) -> NpuKernel:
    """Converts the given internally used kernel object to NpuKernel (of public API)"""
    return NpuKernel(
        kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
    )


def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
    """Converts the given public API object to Kernel (used internally)"""
    if kernel is None:
        return Kernel(1, 1)
    return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
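
# Example (illustrative only): to_kernel(None) falls back to a default 1x1 kernel, while
# to_kernel(to_npu_kernel(k)) reproduces k's width/height/stride/dilation, so the two helpers
# act as inverses for fully specified kernels.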


def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
    if ifm_shape == []:
        # Scalar needs to be in IFM2
        return False
    if ifm2_shape == []:
        return True

    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
        if ifm != ifm2 and ifm == 1:
            # Broadcasted FM needs to be in IFM2
            return False
    return True
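
# Example for ifm_ifm2_correct_order (illustrative only): with ifm_shape=[1, 8, 8, 1] and
# ifm2_shape=[1, 8, 8, 16] the first feature map is broadcast in depth, so it has to be moved
# to IFM2 and the function returns False; two identical shapes return True.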


def get_rounding_mode(op: Operation, fused_quantize: bool) -> NpuRoundingMode:
    """Specifies type of rounding to be used"""
    rounding_mode = NpuRoundingMode.TFL
    if op.type == Op.ResizeBilinear:
        rounding_mode = NpuRoundingMode.TRUNCATE
    elif (
        op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
        and op.ifm.dtype == DataType.int16
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    elif (
        not fused_quantize
        and op.type.is_avgpool_op()
        and op.memory_function == Op.ConcatSliceWrite
        and op.kernel.elements_wh() == 1
    ):
        rounding_mode = NpuRoundingMode.NATURAL
    rounding_mode = op.attrs.get("rounding_mode", rounding_mode)
    return rounding_mode


def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        return NpuPadding(top=0, left=0, bottom=0, right=0)
    top, left, bottom, right = primary_op.attrs["explicit_padding"]

    # Check if this is for horizontal ifm streaming
    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
        top = cmd.pad_top
        bottom = cmd.pad_bottom

    # Index from the end, since a 1x1 AvgPool with non 4-dimensional input/output may have been
    # added because an activation function had to be fused
    if len(cmd.ifm_box.start_coord) >= 2 and cmd.ifm_box.start_coord[-2] > 0:
        left = 0
    if len(cmd.ifm_box.end_coord) >= 2 and cmd.ifm_box.end_coord[-2] < Block.from_shape(cmd.ifm_tensor.shape).width:
        right = 0
    return NpuPadding(top=top, left=left, bottom=bottom, right=right)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100171
172
173def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
174 if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
175 base_ptr_idx_map = {
176 MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
177 MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
178 MemType.Scratch: BasePointerIndex.ScratchTensor,
179 MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
180 }
181 else:
182 base_ptr_idx_map = {
183 MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
184 MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
185 MemType.Scratch: BasePointerIndex.ScratchTensor,
186 MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
187 }
188 return int(base_ptr_idx_map[tens.mem_type])
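
# Note on get_region (illustrative only): when the feature map storage area and the fast storage
# area are the same memory (shared SRAM), Scratch and Scratch_fast tensors are addressed through
# the same ScratchTensor base pointer; with a dedicated fast storage area they use separate
# base pointers.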


def get_upscale(op: Operation) -> NpuResamplingMode:
    upscale = NpuResamplingMode.NONE
    if op.type == Op.ResizeBilinear:
        # perform nearest neighbor upscale
        upscale = NpuResamplingMode.NEAREST
    elif op.type == Op.Conv2DBackpropInputSwitchedBias:
        # perform insert zero upscale
        upscale = NpuResamplingMode.TRANSPOSE
    return upscale


def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
        block = ifm_box.get_block()
    else:
        block = ofm_box.get_block()
    return block.depth


def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
    """Checks if quantization should use 0 as zero point"""
    if tens.dtype == DataType.int32 and is_ifm_tensor:
        return True
    if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
        return False
    fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
    forced_ofm_quantization = ps.primary_op.forced_output_quantization
    use_0 = (
        (ps.primary_op.activation is None or forced_ofm_quantization is not None)
        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
        and not fused_quantize
    )
    return use_0


def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for IFM/IFM2"""
    if tens.quantization is None:
        return None
    if use_zero_point_0(ps, tens, True):
        zero_point = 0
    else:
        zero_point = int(tens.quantization.zero_point)
    return NpuQuantization(scale_f32=tens.quantization.scale_f32, zero_point=zero_point)


def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
    """Gets quantization for OFM"""
    op = ps.primary_op
    # Check if the operation's output quantization should be used instead of the output tensor's
    # quantization (used in LUTs)
    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
    if ofm_quant is None:
        return None
    if use_zero_point_0(ps, tens, False):
        zero_point = 0
    else:
        zero_point = int(ofm_quant.zero_point)
    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)


def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures) -> NpuFeatureMap:
    """Creates feature map with common fields populated"""
    fm = NpuFeatureMap()
    fm.region = get_region(tens, arch)
    fm.data_type = dtype_map[tens.dtype]
    if tens.format == TensorFormat.NHWC:
        fm.layout = NpuLayout.NHWC
    elif tens.format == TensorFormat.NHCWB16:
        fm.layout = NpuLayout.NHCWB16
    else:
        assert 0, "Incorrect tensor format"
    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(box.start_coord, box.end_coord)
    for idx, addr in enumerate(addresses):
        if addr is None:
            addresses[idx] = 0
    fm.tiles = NpuTileBox(
        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
    )
    strides = tens.get_strides()
    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
    return fm
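
# Note on create_feature_map (illustrative only): the NpuTileBox describes a feature map that may
# be split over multiple memory tiles (one address per tile, up to four on Ethos-U);
# addresses_for_rolling_buffer returns None for unused tiles, which are replaced by address 0
# before the tile box is built.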


def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
    """Returns address ranges for weights"""
    weights = []
    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
    weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
    substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length

    # Extract weight substream offsets and calculate their lengths
    assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
    weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
    region = get_region(weight_tensor, arch)
    for core in range(substreams):
        address = weight_addr + weight_substream_offsets[core]
        length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
        addr_range = NpuAddressRange(region, int(address), int(length))
        weights.append(addr_range)
    return weights
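
# Note (illustrative only): create_weights and create_biases both emit one NpuAddressRange per
# compressed substream (the loop variable suggests one per NPU core); the offsets are cumulative,
# so offsets[core + 1] - offsets[core] gives the length of each range.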


def create_biases(
    weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
) -> List[NpuAddressRange]:
    """Returns address ranges for biases"""
    biases = []
    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
    scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length

    # Extract scale substream offsets and calculate their lengths
    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
    scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])

    region = get_region(scale_tensor, arch)
    for core in range(substreams):
        address = scale_addr + scale_substream_offsets[core]
        length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
        addr_range = NpuAddressRange(region, int(address), int(length))
        biases.append(addr_range)
    return biases


def create_npu_activation(op: Operation) -> NpuActivation:
    """Creates fused activation function"""
    if op.activation is None:
        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
    faf = op.activation.op_type
    act_op = NpuActivationOp.NONE_OR_RELU
    if faf == Op.Tanh:
        act_op = NpuActivationOp.TANH
    elif faf == Op.Sigmoid:
        act_op = NpuActivationOp.SIGMOID
    elif faf == Op.LUT:
        act_op = NpuActivationOp.TABLE_LOOKUP
    elif not faf.is_relu_op():
        raise Exception("Unsupported fused_activation_function = " + faf.name)

    act = NpuActivation(act_op)
    act.min = op.activation.min
    act.max = op.activation.max
    act.lookup_table_index = op.activation.lut_index
    return act


def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
    """Sets common fields of the given operation"""
    ps = cmd.ps
    op = ps.primary_op

    ifm_height = cmd.ifm_box.get_block().height
    ifm_width = Block.from_shape(cmd.ifm_tensor.shape).width
    ifm_depth = get_ifm_depth(op.type.npu_block_type, cmd.ifm_box, cmd.ofm_box)

    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch)
    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=ifm_width, depth=ifm_depth)
    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)

    out_block = cmd.ofm_box.get_block()
    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch)
    npu_op.ofm.shape = NpuShape3D(height=out_block.height, width=out_block.width, depth=out_block.depth)
    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)

    if cmd.weight_tensor is not None:
        npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
        if cmd.scale_tensor is not None:
            npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
    npu_op.activation = create_npu_activation(op)
    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
    npu_op.rounding_mode = get_rounding_mode(op, npu_op.fused_quantize)
    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])

    if not op.type.is_elementwise_op():
        npu_op.padding = create_padding(cmd, op)
        npu_op.kernel = to_npu_kernel(op.kernel)
    npu_op.ifm_upscale = get_upscale(op)
    return npu_op


def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
    """Converts the command to NpuConv2DOperation"""
    npu_op = NpuConv2DOperation()
    set_common_op_fields(npu_op, cmd, arch)
    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
    else:
        npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
    return npu_op


def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
    """Converts the command to NpuConvDepthWiseOperation"""
    npu_op = NpuConvDepthWiseOperation()
    set_common_op_fields(npu_op, cmd, arch)
    return npu_op


def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
    """Converts the command to NpuPoolingOperation"""
    ps = cmd.ps
    op = ps.primary_op
    pool_op = NpuPoolingOp.AVERAGE
    if op.type.is_maxpool_op():
        pool_op = NpuPoolingOp.MAX
    elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
        pool_op = NpuPoolingOp.AVERAGE
    elif op.type == Op.ReduceSum:
        pool_op = NpuPoolingOp.REDUCE_SUM
    else:
        assert 0, f"Unknown pool type {op.type}"
    npu_op = NpuPoolingOperation(pool_op)
    set_common_op_fields(npu_op, cmd, arch)
    # Pooling specific info
    if op.type == Op.ResizeBilinear and "rescale" in op.attrs:
        npu_op.rescale = op.attrs["rescale"]
    return npu_op


def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
    """Converts the command to NpuElementWiseOperation"""
    ps = cmd.ps
    op = ps.primary_op
    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
    elemwise_op = elementwise_op_map[op.type]
    npu_op = NpuElementWiseOperation(elemwise_op)
    if elemwise_op not in unary_elementwise_ops:
        if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
            # The scalar/broadcasted feature map has to be the ifm2 tensor, so switch the ifms
            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
            npu_op.reversed_operands = True
        npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch)
        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
        if cmd.ifm2_tensor.shape == []:
            # scalar
            assert cmd.ifm2_tensor.quant_values.size == 1
            npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
        else:
            ifm2_blk = cmd.ifm2_box.get_block()
            ifm2_width = Block.from_shape(cmd.ifm2_tensor.shape).width
            npu_op.ifm2.shape = NpuShape3D(height=ifm2_blk.height, width=ifm2_width, depth=ifm2_blk.depth)
    set_common_op_fields(npu_op, cmd, arch)
    # Check if output scale needs to be overridden
    output_scale = None
    if op.type == Op.Add and "resizebilinear" in op.attrs:
        # Force the output scale to be the same as the input scale for
        # a resizebilinear 1x1 that has been converted to an add
        output_scale = npu_op.ifm2.quantization.scale_f32
    if op.type == Op.LeakyRelu:
        output_scale = op.attrs["alpha"]
    if op.type in (Op.Add, Op.Sub) and "rescale" in op.attrs:
        npu_op.rescale = op.attrs.get("rescale")
    if op.type in (Op.Add, Op.Mul, Op.Sub):
        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
            output_scale = 1 / 0x3000
    if output_scale is not None:
        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
    return npu_op


def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
    """Converts the command to NpuDmaOperation"""
    src_region = get_region(cmd.in_tensor, arch)
    if cmd.out_tensor.purpose == TensorPurpose.LUT:
        dest_region = BasePointerIndex.Mem2Mem
    else:
        dest_region = get_region(cmd.out_tensor, arch)

    start_coord = cmd.box.start_coord
    src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
    dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)

    if cmd.in_tensor.compressed_values is not None:
        if cmd.out_tensor.purpose == TensorPurpose.FSBias:
            sz = cmd.in_tensor.storage_size()
        else:
            stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
            sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
    else:
        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
    src = NpuAddressRange(src_region, int(src_addr), int(sz))
    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
    return NpuDmaOperation(src, dest)


def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
    """Converts the high level command to NpuOperation"""
    if cmd.cmdtype == CommandType.DMA:
        npu_op = create_dma_op(cmd, arch)
    elif cmd.cmdtype == CommandType.NpuStripe:
        npu_block_type = cmd.ps.primary_op.type.npu_block_type
        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
            npu_op = create_npu_conv2d_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
            npu_op = create_npu_conv_depthwise_op(cmd, arch)
        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
            npu_op = create_npu_pool_op(cmd, arch)
        elif npu_block_type == NpuBlockType.ElementWise:
            npu_op = create_npu_elementwise_op(cmd, arch)
        else:
            assert 0, f"Unknown command type {npu_block_type}"
    # add a link to the high level command for debugging purposes
    npu_op.cmd = cmd
    return npu_op
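
# Example (illustrative only, not part of the upstream code): a register command stream generator
# would typically convert an entire high level command stream before serialising the resulting
# operations, e.g.
#
#     npu_ops = [convert_command_to_npu_op(cmd, arch) for cmd in high_level_commands]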