# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .api import NpuRoundingMode
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import fix_sg_input_output
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .numeric_util import clamp_sigmoid
from .numeric_util import round_away_zero
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation_util import create_avgpool_nop
from .operation_util import get_pad_values_from_input
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype


passthrough_nodes = (Op.Identity,)

65def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
66 """Creates an average pool for the given concat op/input feature map"""
67 ofm = concat_op.ofm
68 avgpool_op = create_avgpool_nop(name)
69 avgpool_op.inputs = [ifm]
70 avgpool_op.outputs = [ofm]
71
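    # The average pool is a NOP that writes the IFM into the concat OFM at the given write offset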
72 avgpool_op.write_offset = write_offset
73 avgpool_op.write_shape = ifm_shape
74 ofm.ops.append(avgpool_op)
75 DebugDatabase.add_optimised(concat_op, avgpool_op)
76 avgpool_op.ifm_shapes.append(ifm_shape)
77 avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
78 avgpool_op.memory_function = Op.ConcatSliceWrite
79 return avgpool_op
80
81
82def remove_passthrough_tensor(tens, arch, nng):
83 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
84 assert len(tens.ops[0].inputs) == 1
85 tens = tens.ops[0].inputs[0]
86 return tens
87
88
89def rewrite_concat_ops(op, arch):
90 if not op.run_on_npu or not op.type.is_concat_op():
91 return
92
93 axis_4D = 0
94 ofm = op.ofm
95 ofm.ops = []
96 offset = 0
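    # Each input is rewritten below as an avgpool NOP that writes its data at the running offset within the OFM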
97
98 unfuse_activation_function(op)
99
100 if op.type == Op.Pack:
101 # Pack is also referred to as Stack
102 axis = int(op.attrs["axis"])
103 if axis < 0: # Convert to positive axis
104 axis = len(op.inputs[0].shape) + 1 + axis
105
106 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
107
108 axis_4D = axis + (4 - len(desired_shape))
109
110 for idx, inp in enumerate(op.inputs):
111 op.ifm_shapes[idx] = Shape4D(desired_shape)
112 op.type = Op.PackReshaped
113
114 inputs, axis = op.get_concat_inputs_axis()
115 for idx, inp in enumerate(inputs):
116 if op.type != Op.PackReshaped:
117 op.ifm_shapes[idx] = Shape4D(inp.shape)
118 if axis >= 0:
119 axis_4D = axis + (4 - len(inp.shape))
120 else:
121 axis_4D = axis
122 write_offset = [0, 0, 0, 0]
123 write_offset[axis_4D] = offset
124 concat_end = offset + op.ifm_shapes[idx][axis_4D]
125 create_avg_pool_for_concat(
126 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
127 )
128 offset = concat_end
129 assert ofm.shape[axis] == offset
130
131 return op
132
133
134def rewrite_split_ops(tens, arch, nng):
135
136 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
137 split_op = tens.ops[0]
138
        # Not supported, so leave it to run on the CPU
140 if not split_op.run_on_npu:
141 return tens
142
143 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
144
145 tens.ops = []
146 new_op = Operation(Op.SplitSliceRead, split_op.name)
147 new_op.inputs = [inp]
148 ofm_shape_idx = 0
Tim Hall51a8dce2021-12-20 16:49:27 +0000149 if None in (offset_end, offset_start):
150 read_shape = None
151 else:
152 # the read shape is relative to each start offset
153 read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200154
155 # For Split the offset cannot be extracted from the tensor so it has to
156 # be calculated from the index of the output tensor
157 if axis is not None:
158 # Get the start and end of the split
159 offset_start = [0] * 4
160 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
161 for idx, out in enumerate(outputs):
162 if axis_4D_list is not None:
163 axis_4D = axis_4D_list[idx]
164 else:
165 split_op.ofm_shapes[idx] = Shape4D(out.shape)
166 if axis >= 0:
167 axis_4D = axis + (4 - len(out.shape))
168 else:
169 axis_4D = axis
170
171 if out == tens:
172 ofm_shape_idx = idx
173 read_shape = split_op.ofm_shapes[idx]
174 break
175
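                # Not the output tensor being rewritten; step the start offset past this output along the split axis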
176 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
177
178 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
179 new_op.read_shapes[0] = read_shape
180 new_op.run_on_npu = True
181 new_op.set_output_tensor(tens)
182 new_op.ifm_shapes.append(Shape4D(inp.shape))
183 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
184 DebugDatabase.add_optimised(split_op, new_op)
185
186 return tens
187
188
189def remove_SplitSliceRead(op, arch):
190
191 if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
193 if (
194 len(op.ofm.consumer_list) == 1
195 and op.ofm.consumer_list[0] is not None
196 and op.ofm.consumer_list[0].run_on_npu
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +0200197 and op.ofm.consumer_list[0].type not in memory_only_ops
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200198 and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
199 ):
200 # SplitSliceRead can be performed by tensor consumer
201 cons_op = op.ofm.consumer_list[0]
Patrik Gustavssonf1580f02021-09-01 12:43:02 +0200202 move_splitsliceread_to_consumer(op, cons_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200203 else:
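            # The consumer cannot take over the read, so keep it as an avgpool NOP that performs the sliced read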
204 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
205 avgpool_op.add_input_tensor(op.ifm)
206 avgpool_op.outputs = [op.ofm]
207 op.ofm.ops.remove(op)
208 op.ofm.ops.append(avgpool_op)
209 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
210 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
211 avgpool_op.read_offsets[0] = op.read_offsets[0]
212 avgpool_op.read_shapes[0] = op.read_shapes[0]
213
214 op.ifm.consumer_list.remove(op)
215 DebugDatabase.add_optimised(op, avgpool_op)
216
217
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200218def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
219 k_w, k_h = kernel.dilated_wh()
220 s_x, s_y = kernel.stride
221 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
222 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
223 if padding_type == Padding.SAME:
224 left_pad = (xpad + 0) // 2
225 right_pad = (xpad + 1) // 2
226 top_pad = (ypad + 0) // 2
227 bottom_pad = (ypad + 1) // 2
228 elif padding_type == Padding.VALID:
229 left_pad = 0
230 right_pad = 0
231 top_pad = 0
232 bottom_pad = 0
233 elif padding_type == Padding.EXPLICIT:
234 # Padding is specified in a PAD operator which has been bypassed.
235 top, left, bottom, right = explicit_padding
236 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
237 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
238 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000239 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200240 padding = (top_pad, left_pad, bottom_pad, right_pad)
241 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
242 return padding, skirt
243
244
245def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
246 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
247 if padding_type == Padding.SAME:
248 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
249 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
250 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
251 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
252 left_pad = max(kernel_width - 1 - right_pad, 0)
253 top_pad = max(kernel_height - 1 - bottom_pad, 0)
254 elif padding_type == Padding.VALID:
255 right_pad = max(kernel_width - 2, 0)
256 bottom_pad = max(kernel_height - 2, 0)
257 left_pad = kernel_width - 1
258 top_pad = kernel_height - 1
259 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000260 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200261 padding = (top_pad, left_pad, bottom_pad, right_pad)
262 skirt = padding
263 return padding, skirt
264
265
266def fixup_conv2d_backprop(op, arch, nng):
267 if op.type == Op.Conv2DBackpropInput:
268 # flip the inputs
269 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
270 op.type = Op.Conv2DBackpropInputSwitchedBias
271 op.ifm.resampling_mode = resampling_mode.TRANSPOSE
272
273 # Update strides
274 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
275
276 return op
277
278
279# Convert the op to an elementwise add
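# A 1x1 IFM holds only a single value per channel, so the bilinear resize reduces to broadcasting that value;
# this is implemented as an elementwise Add with an all-zero tensor of the OFM shape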
280def convert_resizebilinear_1x1_to_add(op):
281 op.type = Op.Add
282 op.name = op.name + "_add"
283 op.attrs["resizebilinear"] = True
284 # Create an input tensor filled with zeros
285 shape = op.ofm_shapes[0].as_list()
286 tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
James Peet7519d502021-07-19 16:47:58 +0100287 tens.values = np.zeros(shape, tens.dtype.as_numpy_type())
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200288 tens.quantization = QuantizationParameters(0.0, 255.0)
289 tens.quantization.scale_f32 = 1.0
290 tens.quantization.zero_point = 0
291 tens.consumer_list = [op]
292 tens_op = op.inputs[1].ops[0]
293 tens_op.set_output_tensor(tens)
294 # Set the add inputs
295 op.inputs[1] = op.inputs[0]
296 op.inputs[0] = tens
297 op.set_ifm_ofm_shapes()
298
299 return op
300
301
# Convert ResizeBilinear to a number of 2x2 nearest-neighbor upscaling ops and one avgpool op with a kernel size
# dependent on the upscaling factor. The avgpool kernel limit of 8x8 when padding is applied limits upscaling to 8x8.
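# e.g. a 4x upscale is performed as two 2x2 nearest-neighbor upscale steps, where the final step also applies a
# 4x4 avgpool with an explicit right/bottom padding of 3.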
304def convert_resizebilinear_to_nearest_neighbor_upscaling_and_pool(op):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200305 pre_op = op
306 outputs = op.outputs
Rickard Boline546def2022-01-25 15:45:00 +0000307 dtype = op.ifm.dtype
308 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200309 if op.attrs["align_corners"]:
310 shape_modifier = 1
311 op.attrs["padding"] = Padding.VALID
312 else:
313 shape_modifier = 0
314 op.attrs["padding"] = Padding.SAME
315 op.inputs[0].resampling_mode = resampling_mode.NEAREST
316
317 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
318 out_shape = np.array(op.ofm_shapes[0].get_hw_as_list())
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200319
Rickard Boline546def2022-01-25 15:45:00 +0000320 # Calculate how many times 2x2 upscaling needs to be performed
321 upscale_factor = round(out_shape[1] / upscaled_shape[1])
322 n = int(np.log2(upscale_factor))
323
324 # Perform 2x2 upscaling n-1 times
325 scaled_op = pre_op
326 for count in range(n - 1):
327 if count > 0:
328 scaled_op = op.clone(f"_{count}")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200329 scaled_op.inputs[0] = pre_op.outputs[0]
330
Rickard Boline546def2022-01-25 15:45:00 +0000331 # Nearest neighbor 2x2 upscaling
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200332 upscaled_shape = upscaled_shape * 2 - shape_modifier
Rickard Boline546def2022-01-25 15:45:00 +0000333 shape = op.ofm_shapes[0].as_list()
334 shape[1:3] = upscaled_shape
335 out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
336 out_tens.quantization = op.outputs[0].quantization.clone()
337 scaled_op.set_output_tensor(out_tens)
338 pre_op = scaled_op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200339
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200340 scaled_op.set_ifm_ofm_shapes()
341
    # The last 2x2 upscaling also applies an avgpool with a kernel size dependent on the upscaling factor and adds
    # padding to the right and bottom.
344 if n > 1:
345 scaled_op = op.clone(f"_{n-1}")
346 scaled_op.inputs[0] = pre_op.outputs[0]
347 scaled_op.attrs["padding"] = Padding.EXPLICIT
348 scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]
349 scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
350 scaled_op.outputs = outputs
351 scaled_op.outputs[0].ops = [scaled_op]
352 scaled_op.set_ifm_ofm_shapes()
353
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200354 return op
355
356
357def fixup_resizebilinear(op, arch, nng):
358 if op.type == Op.ResizeBilinear and op.run_on_npu:
359 if op.ifm_shapes[0] == op.ofm_shapes[0]:
360 # Bypass nop resizebilinear
361 op.inputs = op.inputs[:1]
362 op.type = Op.Identity
363 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
364 convert_resizebilinear_1x1_to_add(op)
365 else:
Rickard Boline546def2022-01-25 15:45:00 +0000366 convert_resizebilinear_to_nearest_neighbor_upscaling_and_pool(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200367
368 return op
369
370
371def convert_nop_split_to_identity(op, arch, nng):
372 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it doesn't, remove_passthrough_tensor will fail appropriately
375 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
376 op.type = Op.Identity
377 return op
378
379
380def rewrite_fully_connected_input(op, arch, nng):
381 if op.type == Op.FullyConnected:
382 n_in_elems = op.weights.shape[-2]
383 elms = op.ifm.elements()
384 batch_size = elms // n_in_elems
385 assert batch_size * n_in_elems == elms
386
387 op.ifm_shapes[0] = Shape4D([batch_size, 1, 1, n_in_elems])
388 return op
389
390
391def convert_batched_fc_shape(op, arch, nng):
392 if op.type == Op.FullyConnected:
393 # Check if the first dimension indicates batching
394 if op.ifm_shapes[0].batch > 1:
395 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
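            # e.g. a batch of 8 is reshaped to a 1x2x4xC feature map; batch sizes without an entry fall back to (1, n)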
396 n = op.ifm_shapes[0].batch
397 h, w = batching_split.get(n, (1, n))
398 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
399
400 # Reshape Weights to be 4D. IO becomes HWIO
401 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +0100402 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
403 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200404
405 n = op.ofm_shapes[0].batch
406 h, w = batching_split.get(n, (1, n))
407 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
408 return op
409
410
411def unfuse_activation_function(op):
412 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
413 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
414 op.activation = None
415 out_tens = op.outputs[0]
416 intermediate_tens = out_tens.clone("_act_intermediate")
417 act_op.set_output_tensor(out_tens)
418 act_op.add_input_tensor(intermediate_tens)
419 op.set_output_tensor(intermediate_tens)
420 act_op.set_ifm_ofm_shapes()
421
422
423def rewrite_stridedslice_output(op, arch, nng):
424 if not op.run_on_npu or op.type != Op.StridedSlice:
425 return op
426
427 new_axis_mask = op.attrs["new_axis_mask"]
428 shrink_axis_mask = op.attrs["shrink_axis_mask"]
429
430 if shrink_axis_mask == 0 and new_axis_mask == 0:
431 return op
432
433 axis_4D = [0] * len(op.outputs)
434 for idx, out_tens in enumerate(op.outputs):
435 output_shape = list(out_tens.shape)
436
437 if shrink_axis_mask != 0:
438 n = 0
439 axis = 0
440 while shrink_axis_mask:
441 prev_mask = shrink_axis_mask
442 n += 1
443 shrink_axis_mask &= shrink_axis_mask - 1
444 axis = int(math.log2(prev_mask - shrink_axis_mask))
445 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
446
447 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
448 op.attrs["shrink_axis_mask"] = 0
449 if axis >= 0:
450 axis_4D[idx] = axis + (4 - len(output_shape))
451 else:
452 axis_4D[idx] = axis
453 op.ofm_shapes[idx] = Shape4D(output_shape)
454
455 elif new_axis_mask != 0:
456 n = 0
457 axis = 0
458 while new_axis_mask:
459 prev_mask = new_axis_mask
460 n += 1
461 new_axis_mask &= new_axis_mask - 1
462 axis = int(math.log2(prev_mask - new_axis_mask))
463 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
464 new_axis_mask >>= 1
465
466 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
467 op.attrs["new_axis_mask"] = 0
468 if axis >= 0:
469 axis_4D[idx] = axis + (4 - len(output_shape))
470 else:
471 axis_4D[idx] = axis
472 op.ofm_shapes[idx] = Shape4D(output_shape)
473
474 op.attrs["split_axis_4D"] = axis_4D
475 return op
476
477
478def rewrite_unpack_output(op, arch, nng):
479 tens = op.outputs[0]
480 if op.run_on_npu and op.type == Op.Unpack:
481 # Unpack is also referred to as Unstack
482 axis = int(op.attrs["axis"])
483 if axis < 0: # Convert to positive axis
484 axis = len(op.inputs[0].shape) + 1 + axis
485 op.type = Op.UnpackReshaped
486 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
487
488 axis_4D = axis + (4 - len(desired_output_shape))
489 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
490
491 for idx, out_tens in enumerate(op.outputs):
492 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
493 return op
494
495
496def add_padding_fields(op, arch, nng):
497 if op.run_on_npu:
498 if "padding" in op.attrs:
499 input_shape = op.ifm_shapes[0]
500 output_shape = op.ofm_shapes[0]
501 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
502 kernel_size = op.inputs[1].shape[:2]
503 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
504 kernel_size = op.attrs["ksize"][1:3]
505 else:
506 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
507
508 if op.type == Op.Conv2DBackpropInputSwitchedBias:
509 upscaling_factor = output_shape.height // input_shape.height
510 padding, skirt = calc_upscaled_padding_and_skirt(
511 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
512 )
513 else:
514 padding, skirt = calc_padding_and_skirt(
515 op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
516 )
517
518 op.attrs["explicit_padding"] = padding
519 op.attrs["skirt"] = skirt
520
521 return op
522
523
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200524def reorder_depthwise_weights(op, arch, nng):
525 if op.type.is_depthwise_conv2d_op():
526 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +0100527 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
528 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200529 weight_tensor.weight_transpose_depthwise = True
530
531 return op
532
533
534def optimise_strided_conv(op, arch, nng):
535 stride_x, stride_y = op.get_kernel_stride()
536 ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
537
538 if (
539 op.type == Op.Conv2DBias
540 and op.op_index == 0
541 and stride_x == 2
542 and op.ifm_shapes[0].depth <= 4
543 and op.ifm_shapes[0].width % 2 == 0
544 and weight_tensor is not None
545 and weight_tensor.shape[1] >= 2
546 ):
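        # Rewrite the stride-2 (in W) convolution as a stride-1 convolution by halving the IFM width and doubling
        # its depth; the weights are padded with the zero point and reshaped below to match the new IFM layout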
547 ifm_shape = op.ifm_shapes[0]
548 # IFM
549 op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])
550
551 # Weights
552 weight_shape = weight_tensor.shape
553 if weight_shape[1] % 2 != 0:
554 weight_shape[1] = weight_shape[1] + 1
555 padded_array = np.zeros(weight_shape)
556 for i in range(weight_shape[0]):
557 padded_array[i] = np.vstack(
558 [
James Peet7519d502021-07-19 16:47:58 +0100559 weight_tensor.values[i],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200560 np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
561 ]
562 )
James Peet7519d502021-07-19 16:47:58 +0100563 weight_tensor.values = padded_array
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200564 weight_shape[1] //= 2
565 weight_shape[2] *= 2
James Peet7519d502021-07-19 16:47:58 +0100566 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200567 weight_tensor.set_all_shapes(weight_shape)
568 # If multiple copies of the weights are used, we could avoid
569 # them having the same address by changing the value_id
570 weight_tensor.value_id = uuid.uuid4()
571
572 # Strides
573 stride_x = 1
574 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
575
576 return op
577
578
579def convert_conv_to_fc(op, arch, nng):
580 # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
584 if op.type == Op.Conv2DBias:
585 h = op.ifm_shapes[0].height
586 w = op.ifm_shapes[0].width
587 kh, kw, _, _ = op.inputs[1].shape
588 if h == 1 and w == 1 and kh == 1 and kw == 1:
589 # Overwrite this op as a Fully Connected Op
590 op.name += "_fc"
591 op.type = Op.FullyConnected
592 op.attrs = {
593 "weights_format": 0,
594 }
595 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
596 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +0100597 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
598 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200599
600 DebugDatabase.add_optimised(op, op)
601 return op
602
603
604def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
605 if op.run_on_npu and op.type.is_relu_op():
606 ifm = op.inputs[0]
607 ofm = op.outputs[0]
        # A Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own primary op to be inserted
610 if not check_quantized_tens_scaling_equal(ifm, ofm):
611 # Override this op with its own primary op (avgpool)
612 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
613 # And fuse the original activation function to it
614 relu_fused_op.activation = create_activation_function(op.type)
Fredrik Svedberg1a7527c2021-09-13 15:52:16 +0200615 # Add explicit rescaling
616 rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
617 multiplier, shift = scaling.quantise_scale(rescale)
618 relu_fused_op.rescale = ExplicitScaling(False, [shift], [multiplier])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200619 # Tidy up and assign the ifm and ofm to the new op
620 ifm.consumer_list.remove(op)
621
622 relu_fused_op.add_input_tensor(ifm)
623 relu_fused_op.set_output_tensor(ofm)
624 relu_fused_op.set_ifm_ofm_shapes()
625 op = relu_fused_op
626 return op
627
628
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200629def convert_softmax(op, arch, nng):
630 if op.type == Op.Softmax and op.run_on_npu:
631 softmax = SoftMax(op)
632 op = softmax.get_graph()
633 return op
634
635
636def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
637 r"""Whenever there is a subgraph with this topology:
638
639 Input X For X = -1 or X > 0
640 | \ / This subgraph can be replaced with either
641 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
642 | /
643 Max
644 """
645
646 if op.type == Op.Maximum:
647 # finds the Mul input(s) to the Max
648 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
649 if len(muls) == 1:
650 mul = muls[0].ops[0]
651 elif len(muls) == 2:
652 # In the case both inputs are Muls, find the one with the same input as the Max
653 mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
654 else:
655 # No Mul inputs
656 return op
657
658 # make sure the Mul doesn't have any other consumers
659 mul_ofm = mul.outputs[0]
660 if len(mul_ofm.consumers()) != 1:
661 return op
662 # make sure the Mul doesn't have a fused activation function
663 if mul.activation:
664 return op
665 ifm, ofm = op.get_ifm_ofm()
666 if ifm is None or ofm is None:
667 return op
668
669 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
670 return op
671 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
672 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
673 return op
674
675 # finds the branched input that goes to both the Max and the Mul
676 shared = set(op.inputs) & set(mul.inputs)
677 if len(shared) == 1:
678 shared_in = shared.pop()
679 # find the constant scalar input to the Mul
680 const_tens = (set(mul.inputs) - {shared_in}).pop()
681 # check that it is a scalar
682 if const_tens.shape != []:
683 return op
684 const = const_tens.ops[0]
685 # check that it is a constant
686 if const.type != Op.Const:
687 return op
688 # Remove the Mul from the shared input's consumers
689 shared_in.consumer_list.remove(mul)
690 else:
691 return op
692
693 val = const.outputs[0].values
694 if val >= 0:
695 new_op = Op.LeakyRelu
696 op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scaling", to be used as input
            # to the LUT construction
James Peet7519d502021-07-19 16:47:58 +0100700 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200701 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
702 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
703 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
704 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
705 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
706 elif val == -1:
707 new_op = Op.Abs
708 else:
709 return op
710
711 op.type = new_op
712 op.name = op.name.replace("Maximum", new_op.name)
713 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
714 op.inputs = [shared_in]
715 op.set_ifm_ofm_shapes()
716
717 # Record optimisation in debug database
718 DebugDatabase.add_optimised(op, op)
719
720 return op
721
722
723def convert_hardswish_to_lut(op, arch, nng):
724 if op.type == Op.HardSwish:
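        # HardSwish(x) = x * relu6(x + 3) / 6; for int8/uint8 it can be pre-computed into a 256-entry LUT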
725 ifm, ofm = op.get_ifm_ofm()
726 # Generate the LUT
727 ifm_scale = np.double(ifm.quantization.scale_f32)
728 ofm_scale = np.double(ofm.quantization.scale_f32)
729 zp_in = ifm.quantization.zero_point
730 zp_out = ofm.quantization.zero_point
731 ifm_scale_hires = (1 / 128) * ifm_scale
732 relu_multiplier = np.double(3 / 32768)
733 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
734 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
735 # Use 16bit scale
736 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
737 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
738
739 values = []
740 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
741 quantized_min = min(ix)
742 quantized_max = max(ix)
743 for x in ix:
744 input_value = x - zp_in
745 input_value_hires = input_value * 128
746 # Compute the input value on essentially the output scale, not shifted yet
747 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
            # Compute the "relu-ish multiplier". This matches the code in the TensorFlow Lite Micro kernel
749 relu_value = np.int16(input_value_hires)
750 if relu_shift < 31:
751 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
752
753 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
754
755 if relu_shift < 31:
756 relu_value = fp_math.shift_left16(relu_value, 1)
757
758 if relu_shift > 31:
759 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
760
761 # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
762 # Now convert that to a 16bit fixedpoint value in [0, 1]
763 relu_value = (relu_value + (1 << 15)) >> 1
764 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
765 shift = 31 - out_shift
766 shift = -shift if shift < 0 else 0
767 # Finally apply the output shift
768 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
769 lut_result = min(quantized_max, max(quantized_min, lut_result))
770 values.append(lut_result)
771 return convert_to_lut(op, values, "hardswish")
772 return op
773
774
775def convert_lrelu_to_mul_max(op, arch):
776 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
777 # (the opposite of convert_mul_max_to_abs_or_lrelu)
778 ifm, ofm = op.get_ifm_ofm()
779 if ifm is None or ofm is None:
780 return op
781
782 # Add multiplication with alpha
783 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
784 mul_alpha.add_input_tensor(ifm)
785 # Create const tensor containing alpha as scalar
Fredrik Svedbergcce872b2021-09-02 15:20:52 +0200786 alpha = np.float32(op.attrs["alpha"])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200787 quantization = ifm.quantization.clone()
788 quantization.min = 0
789 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
790 quantization.zero_point = 0
Fredrik Svedbergcce872b2021-09-02 15:20:52 +0200791 if np.isinf(1 / alpha):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200792 # Handling of alpha near zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +0200793 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200794 scalar = 0
795 else:
796 quantization.scale_f32 = alpha
797 scalar = alpha
798 alpha_tens = create_const_tensor(
799 op.name + "_alpha_scalar", [], ifm.dtype, [scalar], np.float32, quantization=quantization
800 )
James Peet7519d502021-07-19 16:47:58 +0100801 alpha_tens.values = np.array([1])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200802 mul_alpha.add_input_tensor(alpha_tens)
803 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
804 mul_alpha.set_output_tensor(fm_alpha)
805 mul_alpha.set_ifm_ofm_shapes()
806 DebugDatabase.add_optimised(op, mul_alpha)
807
808 if check_quantized_tens_scaling_equal(ifm, ofm):
809 # No identity multiplication is needed
810 fm_id = ifm
811 else:
812 # Add multiplication with identity
813 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
814 mul_identity.add_input_tensor(ifm)
815 # Create const tensor containing identity as scalar
816 quantization = ifm.quantization.clone()
817 quantization.min = 0
818 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +0200819 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200820 quantization.zero_point = 0
821 identity_tens = create_const_tensor(
822 op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
823 )
824 mul_identity.add_input_tensor(identity_tens)
825 # Make sure that fm_id is allocated to a different address than fm_alpha
826 fm_id = ofm.clone(op.name + "_id", set_unique=True)
827 mul_identity.set_output_tensor(fm_id)
828 mul_identity.set_ifm_ofm_shapes()
829 DebugDatabase.add_optimised(op, mul_identity)
830
831 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
832 op.type = Op.Maximum
833 op.name = op.name.replace("LeakyRelu", "Maximum")
834 op.inputs = []
835 ifm.consumer_list.remove(op)
836 op.add_input_tensor(fm_alpha)
837 op.add_input_tensor(fm_id)
838 op.set_ifm_ofm_shapes()
839
840 DebugDatabase.add_optimised(op, op)
841 return op
842
843
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200844def convert_to_lut8(op, fn, fn_name):
845 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
846 # fn is a function(real) -> real
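    # e.g. convert_to_lut8(op, math.tanh, "tanh") is used below to build the 256-entry LUT for an int8/uint8 Tanh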
847 ifm, ofm = op.get_ifm_ofm()
848 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
849 return op
850 # Generate the LUT
851 ifm_scale = np.double(ifm.quantization.scale_f32)
852 ofm_scale = np.double(ofm.quantization.scale_f32)
853 zp_in = ifm.quantization.zero_point
854 zp_out = ofm.quantization.zero_point
855 values = []
856 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
857 quantized_min = min(ix)
858 quantized_max = max(ix)
859 for x in ix:
860 x_real = ifm_scale * (x - zp_in)
861 y_real = fn(x_real)
862 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
863 lut_result = min(quantized_max, max(quantized_min, lut_result))
864 values.append(lut_result)
865 return convert_to_lut(op, values, fn_name)
866
867
868def convert_lrelu_to_lut(op, arch):
869 ifm, ofm = op.get_ifm_ofm()
870 # Generate the LUT
871 alpha = op.attrs["alpha"]
872 ifm_scale = np.double(ifm.quantization.scale_f32)
873 ofm_scale = np.double(ofm.quantization.scale_f32)
874 zp_in = ifm.quantization.zero_point
875 zp_out = ofm.quantization.zero_point
876 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
877 alpha_scalar = 1
878 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
879 if "alpha_scaling" in op.attrs:
880 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
881 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
882 values = []
883 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
884 quantized_min = min(ix)
885 quantized_max = max(ix)
886 for x in ix:
887 if x < zp_in:
888 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
889 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
890 )
891 else:
892 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
893 lut_result = min(quantized_max, max(quantized_min, lut_result))
894 values.append(lut_result)
895 return convert_to_lut(op, values, "lrelu")
896
897
898def convert_lrelu(op, arch, nng):
899 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
900 if op.type != Op.LeakyRelu:
901 return op
902 ifm, ofm = op.get_ifm_ofm()
903 if ifm is None or ofm is None:
904 return op
905 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
906 # use LUT for int8/uint8
907 return convert_lrelu_to_lut(op, arch)
908 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16:
909 # use LeakyRelu unmodified for int16 with equal input/output scaling
910 return op
911 return convert_lrelu_to_mul_max(op, arch)
912
913
914def convert_tanh_sigmoid_to_lut(op, arch, nng):
915 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
916 if op.type == Op.Sigmoid:
917 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
918 elif op.type == Op.Tanh:
919 return convert_to_lut8(op, math.tanh, "tanh")
920 return op
921
922
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +0200923def remove_memory_only_ops(op, arch):
924 if op.run_on_npu and op.type in memory_only_ops:
925 bypass_memory_only_ops(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200926
927
928def fuse_activation_function_with_prev(op, arch, nng):
929 # if op is a no-op: attempts to move the activation function to the preceding op
930 if not op.attrs.get("is_nop", False) or op.activation is None:
931 return op
932 ifm, ofm = op.get_ifm_ofm()
933 if ifm is None or ofm is None:
934 return op
935 # finds the input(s) to the operation
936 prev_op = ifm.ops[0]
937 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
938 fuse = (
939 prev_op.run_on_npu
940 and prev_op.type.npu_block_type != NpuBlockType.Default
941 and len(ifm.ops) == 1
942 and len(prev_op.outputs[0].consumers()) == 1
943 and prev_op.activation is None
944 )
945 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
946 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
947 # LUT currently only works correctly for elementwise ops
948 fuse = False
949 if not fuse:
950 return op
951 # Move the fused activation function + corresponding info to prev_op
952 prev_op.activation = op.activation
953 prev_op.forced_output_quantization = op.forced_output_quantization
954 if op.activation_lut is not None:
955 prev_op.set_activation_lut(op.activation_lut)
956 # Bypass op
957 prev_op.set_output_tensor(ofm)
958 DebugDatabase.add_optimised(op, prev_op)
959 return op
960
961
962def _leading_pad_ok(leading_pad, stride, kernel_size):
963 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
964 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
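    # e.g. kernel_size=7, stride=2: max_size=3, so a leading pad of 0, 2 or 3 is accepted while 1 is not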
965 max_size = kernel_size // 2
966 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
967
968
969def replace_pad_by_hw_pad(op: Operation, arch, nng):
970 """
971 Tries to completely remove a PAD operator by using hardware padding.
972 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
973 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
974 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
975 if both operations can be run on the NPU.
976 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
977 """
978 if (
979 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +0000980 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200981 and op.run_on_npu
982 and op.attrs["padding"] == Padding.VALID
983 ):
984 pad_op = op.ifm.ops[0]
985 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
986 return op
987 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
988 return op
989 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
990 k = op.kernel
991 k_w, k_h = k.dilated_wh()
992
993 # Check if the PAD operator can be replaced by hardware padding
994 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
995 # Too much padding, it would require hardware padding to actually insert zeros
996 return op
997 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
998 return op
999
1000 if op.type.is_avgpool_op():
1001 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1002 for pad, k_size in (
1003 (left, k_w),
1004 (right, k_w),
1005 (top, k_h),
1006 (bottom, k_h),
1007 ):
1008 if pad not in (0, k_size // 2):
1009 return op
1010 # Average pool is converted to depthwise, because NPU average pool + same padding
1011 # has a special implementation that is different from PAD followed by average pool with
1012 # valid padding.
1013 k_w, k_h = op.kernel.width, op.kernel.height
1014 ifm = op.ifm
1015 # Remember other inputs
1016 other_inputs = op.inputs[1:]
1017 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1018 quantization = QuantizationParameters(0.0, 255.0)
1019 quantization.scale_f32 = 1.0 / (k_w * k_h)
1020 quantization.zero_point = 0
1021 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1022 weights = np.full(shape, 1)
1023
1024 weight_tens = create_const_tensor(
1025 op.name + "_weights",
1026 shape,
1027 op.ifm.dtype,
1028 weights,
1029 np.uint8,
1030 purpose=TensorPurpose.Weights,
1031 quantization=quantization,
1032 )
James Peet7519d502021-07-19 16:47:58 +01001033 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001034 op.type = Op.DepthwiseConv2DBias
1035 op.inputs = []
1036 op.add_input_tensor(ifm)
1037 op.add_input_tensor(weight_tens)
1038 # Add bias tensor, all biases set to 0
1039 op.inputs.append(None)
1040 fixup_bias_tensors(op, arch, nng)
1041 # Add other inputs
1042 op.inputs.extend(other_inputs)
1043 op.rounding_mode = NpuRoundingMode.NATURAL
1044
1045 # Bypass the PAD operator
1046 op.set_input_tensor(pad_op.ifm, 0)
1047 # Adjust the padding attributes of the convolution operator
1048 op.attrs["padding"] = Padding.EXPLICIT
1049 op.attrs["explicit_padding"] = (top, left, bottom, right)
1050 op.set_ifm_ofm_shapes()
1051 return op
1052
1053
1054def convert_pad(op: Operation, arch, nng):
1055 """
1056 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1057 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1058 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1059 """
1060 if op.type != Op.Pad or not op.run_on_npu:
1061 return op
1062 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1063
1064 ifm = op.ifm
1065 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001066 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001067 ofm = op.ofm
1068 assert ofm is not None
1069 ofm.ops = []
1070 ofm_shape = op.ofm_shapes[0]
1071
1072 # Average pool op that copies IFM to the right place inside the OFM
1073 shp0 = Shape4D(0, 0, 0, 0)
1074 shp_top = shp0.with_height(top)
1075 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1076 avgpool_op.activation = op.activation
1077 quant = ofm.quantization
1078 pad_value = quant.zero_point
1079 # Add operations that fill the borders of the OFM
1080 if top > 0:
1081 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1082 zero_tens = create_const_tensor(
1083 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1084 )
1085 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1086 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1087 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1088 if bottom > 0:
1089 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1090 zero_tens = create_const_tensor(
1091 op.name + "_bottom",
1092 shape.as_list(),
1093 ofm.dtype,
1094 shape.elements() * [pad_value],
1095 np.uint8,
1096 quantization=quant,
1097 )
1098 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1099 create_avg_pool_for_concat(
1100 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1101 )
1102 if left > 0:
1103 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1104 zero_tens = create_const_tensor(
1105 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1106 )
1107 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1108 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1109 if right > 0:
1110 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1111 zero_tens = create_const_tensor(
1112 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1113 )
1114 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1115 create_avg_pool_for_concat(
1116 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1117 )
1118
1119 op.type = Op.ConcatTFLite
1120 return avgpool_op
1121
1122
1123def add_attrs_to_resizebilinear(op, arch, nng):
1124 if op.type == Op.ResizeBilinear and op.run_on_npu:
1125 input_tensor = op.inputs[0]
1126 input_shape = op.ifm_shapes[0]
1127 upscaled_height = input_shape.height * 2
1128 upscaled_width = input_shape.width * 2
1129 out_shape = op.ofm_shapes[0]
1130 if not op.attrs["align_corners"] and out_shape.height == upscaled_height and out_shape.width == upscaled_width:
1131 # this means the output is supposed to be a x2 upscale,
1132 # so we need to do SAME padding
1133 op.attrs["padding"] = Padding.SAME
1134 elif (
1135 op.attrs["align_corners"]
1136 and out_shape.height == (upscaled_height - 1)
1137 and out_shape.width == (upscaled_width - 1)
1138 ):
1139 # here we can just run the avg pool without padding and
1140 # produce a (M * 2 - 1, N * 2 - 1) sized output
1141 op.attrs["padding"] = Padding.VALID
1142 else:
1143 return op
1144 input_tensor.resampling_mode = resampling_mode.NEAREST
1145 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
1146 return op
1147
1148
1149def fixup_bias_tensors(op, arch, nng):
1150 if op.type.needs_bias() and op.bias is None:
1151 # Op has no bias, add bias tensor filled with zeros
1152 nr_biases = op.inputs[1].shape[-1]
1153 bias_values = [0] * nr_biases
1154 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001155 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1156
1157 return op
1158
1159
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001160def fixup_asymmetric_weights(op, arch, nng):
1161 if op.run_on_npu and (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op()):
1162 if op.ifm.dtype == DataType.int8:
1163 if not np.all(op.weights.quantization.zero_point == 0):
1164 print(f"Warning: {op.type} '{op.name}' has asymmetric weights, zero points have been adjusted.")
1165 op.weights.quantization.zero_point *= 0
1166
1167 return op
1168
1169
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001170def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1171 if op.type == Op.Mean and op.run_on_npu:
1172 keep_dims = op.attrs.get("keep_dims", False)
1173 inp, axis = op.inputs
1174 shape = inp.shape
1175 dims = len(shape)
1176
1177 # Height and width axes have different index depending on dimensions
1178 if axis.shape == [] or axis.shape[0] == 1: # single axis
1179 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1180 if dims in (2, 3):
1181 if axis == 0:
1182 h, w = shape[axis], 1
1183 else:
1184 h, w = 1, shape[axis]
1185 else:
1186 if axis == 1:
1187 h, w = shape[axis], 1
1188 else:
1189 h, w = 1, shape[axis]
1190 else: # multiple axes
1191 axis = sorted(axis.values)
1192 h, w = [shape[i] for i in axis]
1193
1194 # Set necessary depthwise attributes
1195 op.attrs.update(
1196 {
1197 "padding": Padding.VALID,
1198 "stride_h": 1,
1199 "stride_w": 1,
1200 "strides": (1, 1, 1, 1),
1201 "depth_multiplier": 1,
1202 "channel_multiplier": 1,
1203 "dilation_h_factor": 1,
1204 "dilation_w_factor": 1,
1205 "dilation": (1, 1, 1, 1),
1206 }
1207 )
1208 # Change op type
1209 op.type = Op.DepthwiseConv2DBias
1210 # Set IFM/OFM shapes after changing op type
1211 op.set_ifm_ofm_shapes()
1212
1213 weight_scale, bias = 1, None
1214 ofmq, ifmq = op.ofm.quantization, inp.quantization
1215 # Set rounding mode, scaling and zero point based on which reference implementation to match
1216 if len(shape) == 4 and axis == [1, 2] and keep_dims:
1217 if inp.dtype == DataType.uint8:
1218 # This attribute means a different scaling calculation is used in order to match reference
1219 op.low_precision_scaling = True
1220 weight_scale = h * w
1221 # Set zero points to 0 as they will be adjusted for with bias term
1222 foq = ofmq.clone()
1223 foq.zero_point = 0
1224 fiq = ifmq.clone()
1225 fiq.zero_point = 0
1226 op.forced_input_quantization = fiq
1227 bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
1228 # If the bias term is outside uint8 range, we need an Add op to apply it.
1229 if bias_term < 0 or bias_term > 255:
1230 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1231 # Bias term has higher bitness (i32) than input/output (u8).
                    # 16 bits is enough since the bias is added/subtracted from a u8 value;
                    # the bias can only effectively assume values in the range [-255, 255].
1234 intermediate.dtype = DataType.int16
1235 intermediate.quantization.zero_point = 0
1236 add_op = Operation(Op.Add, op.name + "_bias")
1237 add_op.forced_output_quantization = foq
1238 add_op.add_input_tensor(intermediate)
1239 quant = QuantizationParameters()
1240 quant.zero_point = 0
1241 bias_term_tens = create_const_tensor(
James Peet7519d502021-07-19 16:47:58 +01001242 op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001243 )
1244 add_op.add_input_tensor(bias_term_tens)
1245 add_op.set_output_tensor(op.ofm)
1246 add_op.set_ifm_ofm_shapes()
1247 add_op.activation = op.activation
1248 op.activation = None
1249 op.set_output_tensor(intermediate)
1250 op.set_ifm_ofm_shapes()
1251 # If not, we can just do it with the OFM zero point.
1252 else:
1253 foq.zero_point = bias_term
1254 op.forced_output_quantization = foq
1255 else:
1256 assert inp.dtype == DataType.int8
1257 # Use a depthwise to calculate the sum,
1258 # followed by a multiplication with 1/N to get the MEAN
1259 weight_scale = 1
1260 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1261 intermediate.dtype = DataType.int16
1262 mul_op = Operation(Op.Mul, op.name + "_mul")
1263 mul_op.add_input_tensor(intermediate)
1264 # Create scalar containing 1/N
1265 quant = QuantizationParameters()
1266 quant.zero_point = 0
1267 # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
1268 # while rounding mode NATURAL would round this to -1.
1269 # This can only occur if N is even, and can be emulated by
1270 # multiplying with a number that is slightly smaller than 1/N.
1271 # It must be so small that other roundings are not affected;
1272 # the calculated value is based on worst case,
1273 # which is sum 256 * N (the maximum sum that can occur with int8)
1274 n = int(h * w)
1275 eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
1276 quant.scale_f32 = 1 / (n - eps)
1277 scalar = create_const_tensor(
1278 op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
1279 )
1280 mul_op.add_input_tensor(scalar)
1281 mul_op.set_output_tensor(op.ofm)
1282 mul_op.set_ifm_ofm_shapes()
1283 mul_op.rounding_mode = NpuRoundingMode.NATURAL
1284 mul_op.activation = op.activation
1285 op.activation = None
1286 op.set_output_tensor(intermediate)
1287 op.set_ifm_ofm_shapes()
1288 elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
1289 # Here we can just use a simple AvgPool with truncating rounding,
1290 # as we're emulating simple integer division.
1291 op.rounding_mode = NpuRoundingMode.TRUNCATE
1292 op.type = Op.AvgPool
1293 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1294 else:
1295 op.rounding_mode = NpuRoundingMode.NATURAL
1296 weight_scale = 1 / (h * w)
1297 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1298 bias = -ifmq.zero_point * h * w
1299 fiq = ifmq.clone()
1300 fiq.zero_point = 0
1301 op.forced_input_quantization = fiq
1302
1303 # Change dimensions to 4
1304 if dims < 4:
1305 shape = [1] + shape
1306 if dims == 2:
1307 shape += [1]
1308
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001309 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
1310 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001311 shape = [shape[0], 1, h * w, shape[3]]
1312 op.ifm_shapes[0] = Shape4D(shape)
1313 if h > 256 and op.type == Op.AvgPool:
1314 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1315
1316 # If the AvgPool version is used, we don't need to do anything else
1317 if op.type == Op.AvgPool:
1318 return op
1319
1320 # Make unit weight tensor quantization
1321 weight_quant = ifmq.clone()
1322 weight_quant.min = 0
1323 weight_quant.max = 255
1324 weight_quant.scale_f32 = weight_scale
1325 weight_quant.zero_point = 0
1326
1327 # Set weight shape to [H,W,C,B]
1328 weight_shape = shape[1:4] + [shape[0]]
1329 # Add unit weight tensor
1330 op.set_input_tensor(
1331 create_const_tensor(
1332 "weights",
1333 weight_shape,
1334 inp.dtype,
1335 np.ones(weight_shape),
1336 value_dtype=np.uint8,
1337 quantization=weight_quant,
1338 ),
1339 1,
1340 )
James Peet7519d502021-07-19 16:47:58 +01001341 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001342
1343 # Add None bias tensor
1344 op.inputs.append(None)
1345 # Add bias tensor
1346 if bias:
1347 bias_shape = [shape[-1]]
1348 op.set_input_tensor(
1349 create_const_tensor(
Tim Hall8ae29292021-07-28 16:52:03 +01001350 "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001351 ),
1352 2,
1353 )
1354
1355 return op
1356
1357
1358def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02001359 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001360 return op
1361
1362
1363def tflite_optimise_graph(nng, arch):
1364 # Pre-processing step
1365 pre_process_list = [
1366 supported_operator_check,
1367 set_ifm_ofm_op_shapes,
1368 ]
1369
1370 for idx, sg in enumerate(nng.subgraphs):
1371 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1372 nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
1373 )
1374
1375 # Handle Concat Ops
1376 for idx, sg in enumerate(nng.subgraphs):
1377 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1378 sg.refresh_after_modification()
1379
1380 # Handle Split Ops
1381 for idx, sg in enumerate(nng.subgraphs):
1382 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1383 nng,
1384 sg,
1385 arch,
1386 [],
1387 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1388 rewrite_unsupported=False,
1389 )
1390
1391 for idx, sg in enumerate(nng.subgraphs):
1392 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1393 nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
1394 )
1395
1396 # Handle sg input output
1397 for idx, sg in enumerate(nng.subgraphs):
1398 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1399 nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
1400 )
1401
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +02001402 # Removal of memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001403 for sg in nng.subgraphs:
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +02001404 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_only_ops])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001405 sg.refresh_after_modification()
1406
1407 # Rewrite of operators
1408 op_rewrite_list = [
1409 set_tensor_equivalence,
1410 convert_mean_to_depthwise_conv_or_avgpool,
1411 convert_depthwise_to_conv,
1412 convert_conv_to_fc,
1413 convert_softmax,
1414 optimise_strided_conv,
1415 convert_hardswish_to_lut,
1416 rewrite_fully_connected_input,
1417 convert_batched_fc_shape,
1418 fixup_conv2d_backprop,
1419 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001420 reorder_depthwise_weights,
1421 fixup_resizebilinear,
1422 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001423 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001424 convert_mul_max_to_abs_or_lrelu,
1425 convert_lrelu,
1426 convert_tanh_sigmoid_to_lut,
1427 replace_pad_by_hw_pad,
1428 ]
1429
1430 for idx, sg in enumerate(nng.subgraphs):
1431 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1432 nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
1433 )
1434
1435 for idx, sg in enumerate(nng.subgraphs):
1436 # remove passthrough tensors and attempt further optimizations
1437 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1438 nng,
1439 sg,
1440 arch,
1441 [remove_passthrough_tensor],
1442 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
1443 )
1444
    # Removal of SplitSliceRead needs to be done after optimisation has been performed,
    # since ifm/ofm_shapes are of importance to this function
1447 for sg in nng.subgraphs:
1448 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
1449 sg.refresh_after_modification()
1450
1451 return nng