1# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16# Description:
17# Early optimisation of a TensorFlow Lite-based network graph, using the rewrite_graph module
18# to traverse the graph.
19import math
20import uuid
21
22import numpy as np
23
24from . import fp_math
25from . import lut
26from . import rewrite_graph
27from . import scaling
28from .api import NpuRoundingMode
29from .data_type import DataType
30from .debug_database import DebugDatabase
31from .errors import UnsupportedFeatureError
32from .ethos_u55_regs.ethos_u55_regs import resampling_mode
33from .graph_optimiser_util import bypass_reshape_and_squeeze_ops
34from .graph_optimiser_util import calc_explicit_padding
35from .graph_optimiser_util import convert_depthwise_to_conv
36from .graph_optimiser_util import fix_sg_input_output
37from .graph_optimiser_util import needed_total_padding
38from .graph_optimiser_util import set_ifm_ofm_op_shapes
39from .graph_optimiser_util import set_tensor_equivalence
40from .numeric_util import clamp_sigmoid
41from .numeric_util import full_shape
42from .numeric_util import round_away_zero
43from .operation import create_activation_function
44from .operation import NpuBlockType
45from .operation import Op
46from .operation import Operation
47from .operation import Padding
48from .operation_util import create_avgpool_nop
49from .operation_util import get_pad_values_from_input
50from .shape4d import Shape4D
51from .softmax import SoftMax
52from .tensor import check_quantized_tens_scaling_equal
53from .tensor import create_const_tensor
54from .tensor import create_equivalence_id
55from .tensor import QuantizationParameters
56from .tensor import Tensor
57from .tensor import TensorPurpose
58from .tflite_mapping import optype_to_builtintype
59
60passthrough_nodes = (Op.Identity,)
61
62
63def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
64 """Creates an average pool for the given concat op/input feature map"""
65 ofm = concat_op.ofm
66 avgpool_op = create_avgpool_nop(name)
67 avgpool_op.inputs = [ifm]
68 avgpool_op.outputs = [ofm]
69
70 avgpool_op.write_offset = write_offset
71 avgpool_op.write_shape = ifm_shape
72 ofm.ops.append(avgpool_op)
73 DebugDatabase.add_optimised(concat_op, avgpool_op)
74 avgpool_op.ifm_shapes.append(ifm_shape)
75 avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
76 avgpool_op.memory_function = Op.ConcatSliceWrite
77 return avgpool_op
78
79
80def remove_passthrough_tensor(tens, arch, nng):
81 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
82 assert len(tens.ops[0].inputs) == 1
83 tens = tens.ops[0].inputs[0]
84 return tens
85
86
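# Concat ops are implemented by giving each input its own average pool NOP (see
# create_avg_pool_for_concat above) that copies the input into the correct slice of the OFM;
# the write offset advances along the concat axis as each input is processed.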
87def rewrite_concat_ops(op, arch):
88 if not op.run_on_npu or not op.type.is_concat_op():
89 return
90
91 axis_4D = 0
92 ofm = op.ofm
93 ofm.ops = []
94 offset = 0
95
96 unfuse_activation_function(op)
97
98 if op.type == Op.Pack:
99 # Pack is also referred to as Stack
100 axis = int(op.attrs["axis"])
101 if axis < 0: # Convert to positive axis
102 axis = len(op.inputs[0].shape) + 1 + axis
103
104 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
105
106 axis_4D = axis + (4 - len(desired_shape))
107
108 for idx, inp in enumerate(op.inputs):
109 op.ifm_shapes[idx] = Shape4D(desired_shape)
110 op.type = Op.PackReshaped
111
112 inputs, axis = op.get_concat_inputs_axis()
113 for idx, inp in enumerate(inputs):
114 if op.type != Op.PackReshaped:
115 op.ifm_shapes[idx] = Shape4D(inp.shape)
116 if axis >= 0:
117 axis_4D = axis + (4 - len(inp.shape))
118 else:
119 axis_4D = axis
120 write_offset = [0, 0, 0, 0]
121 write_offset[axis_4D] = offset
122 concat_end = offset + op.ifm_shapes[idx][axis_4D]
123 create_avg_pool_for_concat(
124 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
125 )
126 offset = concat_end
127 assert ofm.shape[axis] == offset
128
129 return op
130
131
132def rewrite_split_ops(tens, arch, nng):
133
134 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
135 split_op = tens.ops[0]
136
137 # Not supported so leave it and run on CPU
138 if not split_op.run_on_npu:
139 return tens
140
141 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
142
143 tens.ops = []
144 new_op = Operation(Op.SplitSliceRead, split_op.name)
145 new_op.inputs = [inp]
146 ofm_shape_idx = 0
147 read_shape = offset_end
148
149 # For Split the offset cannot be extracted from the tensor so it has to
150 # be calculated from the index of the output tensor
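        # For example (illustrative): splitting a [1, 8, 8, 16] tensor into four outputs along
        # the depth axis gives each output a depth of 4, so the output with index 2 is read
        # with offset [0, 0, 0, 8].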
151 if axis is not None:
152 # Get the start and end of the split
153 offset_start = [0] * 4
154 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
155 for idx, out in enumerate(outputs):
156 if axis_4D_list is not None:
157 axis_4D = axis_4D_list[idx]
158 else:
159 split_op.ofm_shapes[idx] = Shape4D(out.shape)
160 if axis >= 0:
161 axis_4D = axis + (4 - len(out.shape))
162 else:
163 axis_4D = axis
164
165 if out == tens:
166 ofm_shape_idx = idx
167 read_shape = split_op.ofm_shapes[idx]
168 break
169
170 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
171
172 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
173 new_op.read_shapes[0] = read_shape
174 new_op.run_on_npu = True
175 new_op.set_output_tensor(tens)
176 new_op.ifm_shapes.append(Shape4D(inp.shape))
177 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
178 DebugDatabase.add_optimised(split_op, new_op)
179
180 return tens
181
182
183def remove_SplitSliceRead(op, arch):
184
185 if op.type == Op.SplitSliceRead:
186 # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
187 if (
188 len(op.ofm.consumer_list) == 1
189 and op.ofm.consumer_list[0] is not None
190 and op.ofm.consumer_list[0].run_on_npu
191 and op.ofm.consumer_list[0].type not in (Op.Reshape, Op.Squeeze)
192 and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
193 ):
194 # SplitSliceRead can be performed by tensor consumer
195 cons_op = op.ofm.consumer_list[0]
196 if cons_op.ifm == op.ofm:
197 cons_op.read_offsets[0] = op.read_offsets[0]
198 cons_op.read_shapes[0] = op.read_shapes[0]
199 cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
200 cons_op.ifm_shapes[0] = op.ifm_shapes[0]
201 elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
202 cons_op.read_offsets[1] = op.read_offsets[0]
203 cons_op.read_shapes[1] = op.read_shapes[0]
204 cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
205 cons_op.ifm_shapes[1] = op.ifm_shapes[0]
206
207 if "skirt" in cons_op.attrs:
208 assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
209 cons_op.attrs["skirt"] = None
210 cons_op.attrs["force_padding"] = True
211 op.ofm.consumer_list.remove(cons_op)
212 op.ofm.ops = []
213 op.ifm.consumer_list.remove(op)
214 else:
215 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
216 avgpool_op.add_input_tensor(op.ifm)
217 avgpool_op.outputs = [op.ofm]
218 op.ofm.ops.remove(op)
219 op.ofm.ops.append(avgpool_op)
220 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
221 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
222 avgpool_op.read_offsets[0] = op.read_offsets[0]
223 avgpool_op.read_shapes[0] = op.read_shapes[0]
224
225 op.ifm.consumer_list.remove(op)
226 DebugDatabase.add_optimised(op, avgpool_op)
227
228
229def insert_copy_op_after_tens(tens):
230 tens_cons_list_copy = tens.consumer_list.copy()
231
232 # Create an avg_pool nop op with ifm as input
233 copy_tens = tens.clone()
234 copy_op = create_avgpool_nop(tens.name + "_avgpool")
235 copy_op.add_input_tensor(tens)
236 copy_op.set_output_tensor(copy_tens)
237 copy_op.set_ifm_ofm_shapes()
238 copy_op.run_on_npu = True
239
240 # Set copy_ifm consumers
241 for tens_cons in tens_cons_list_copy:
242 if tens_cons is not None:
243 for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
244 if cons_inp == tens:
245 tens_cons.set_input_tensor(copy_tens, ifm_idx)
246
247 DebugDatabase.add_optimised(tens.ops[0], copy_op)
248
249
250def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
251 k_w, k_h = kernel.dilated_wh()
252 s_x, s_y = kernel.stride
253 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
254 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
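    # Worked example (illustrative, assuming needed_total_padding() returns the standard
    # TensorFlow SAME-padding total): input width 224, stride 2 and dilated kernel width 3
    # give xpad = 1, so SAME padding below becomes left_pad = 0, right_pad = 1.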
255 if padding_type == Padding.SAME:
256 left_pad = (xpad + 0) // 2
257 right_pad = (xpad + 1) // 2
258 top_pad = (ypad + 0) // 2
259 bottom_pad = (ypad + 1) // 2
260 elif padding_type == Padding.VALID:
261 left_pad = 0
262 right_pad = 0
263 top_pad = 0
264 bottom_pad = 0
265 elif padding_type == Padding.EXPLICIT:
266 # Padding is specified in a PAD operator which has been bypassed.
267 top, left, bottom, right = explicit_padding
268 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
269 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
270 else:
271 raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
272 padding = (top_pad, left_pad, bottom_pad, right_pad)
273 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
274 return padding, skirt
275
276
277def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
278 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
279 if padding_type == Padding.SAME:
280 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
281 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
282 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
283 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
284 left_pad = max(kernel_width - 1 - right_pad, 0)
285 top_pad = max(kernel_height - 1 - bottom_pad, 0)
286 elif padding_type == Padding.VALID:
287 right_pad = max(kernel_width - 2, 0)
288 bottom_pad = max(kernel_height - 2, 0)
289 left_pad = kernel_width - 1
290 top_pad = kernel_height - 1
291 else:
292 raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
293 padding = (top_pad, left_pad, bottom_pad, right_pad)
294 skirt = padding
295 return padding, skirt
296
297
298def fixup_conv2d_backprop(op, arch, nng):
299 if op.type == Op.Conv2DBackpropInput:
300 # flip the inputs
301 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
302 op.type = Op.Conv2DBackpropInputSwitchedBias
303 op.ifm.resampling_mode = resampling_mode.TRANSPOSE
304
305 # Update strides
306 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
307
308 return op
309
310
311# Convert the op to an elementwise add
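# A bilinear resize of a 1x1 IFM just broadcasts the single input value over the output, so the
# op can be replaced by an Add of the IFM and a zero-filled constant tensor of the output shape.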
312def convert_resizebilinear_1x1_to_add(op):
313 op.type = Op.Add
314 op.name = op.name + "_add"
315 op.attrs["resizebilinear"] = True
316 # Create an input tensor filled with zeros
317 shape = op.ofm_shapes[0].as_list()
318 tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
319 tens.values = np.zeros(shape, tens.dtype.as_numpy_type())
320 tens.quantization = QuantizationParameters(0.0, 255.0)
321 tens.quantization.scale_f32 = 1.0
322 tens.quantization.zero_point = 0
323 tens.consumer_list = [op]
324 tens_op = op.inputs[1].ops[0]
325 tens_op.set_output_tensor(tens)
326 # Set the add inputs
327 op.inputs[1] = op.inputs[0]
328 op.inputs[0] = tens
329 op.set_ifm_ofm_shapes()
330
331 return op
332
333
334# Convert ResizeBilinear to a number of 2x2 pool ops
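# Each iteration of the loop below doubles the height and width (minus one when align_corners
# is set), using 2x2 average pooling with int16 intermediate tensors, until the OFM resolution
# is reached.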
335def convert_resizebilinear_to_2x2_pool(op):
336 count = 0
337 pre_op = op
338 outputs = op.outputs
339
340 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
341 if op.attrs["align_corners"]:
342 shape_modifier = 1
343 op.attrs["padding"] = Padding.VALID
344 else:
345 shape_modifier = 0
346 op.attrs["padding"] = Padding.SAME
347 op.inputs[0].resampling_mode = resampling_mode.NEAREST
348
349 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
350 out_shape = np.array(op.ofm_shapes[0].get_hw_as_list())
351 if (upscaled_shape == upscaled_shape * 2 - shape_modifier).all():
352 return op
353
354 while (upscaled_shape < out_shape).all():
355 if count == 0:
356 scaled_op = pre_op
357 else:
358 scaled_op = op.clone("_{}".format(count))
359 scaled_op.inputs[0] = pre_op.outputs[0]
360
361 upscaled_shape = upscaled_shape * 2 - shape_modifier
362
363 if (upscaled_shape == out_shape).all():
364 scaled_op.outputs = outputs
365 scaled_op.outputs[0].ops = [scaled_op]
366 else:
367 shape = op.ofm_shapes[0].as_list()
368 shape[1:3] = upscaled_shape
369 out_tens = Tensor(shape, DataType.int16, "{}_{}".format(op.outputs[0].name, count))
370 out_tens.quantization = op.outputs[0].quantization.clone()
371 out_tens.quantization.quant_min = np.iinfo(np.int16).min
372 out_tens.quantization.quant_max = np.iinfo(np.int16).max
373 scaled_op.set_output_tensor(out_tens)
374 pre_op = scaled_op
375 count += 1
376
377 # Setup the scale value
378 if scaled_op.inputs[0].dtype.bits == 8 and scaled_op.outputs[0].dtype.bits == 16:
379 scaled_op.rescale = 128
380 elif scaled_op.inputs[0].dtype.bits == 16 and scaled_op.outputs[0].dtype.bits == 8:
381 scaled_op.rescale = 1 / 128
382 else:
383 scaled_op.rescale = None
384 scaled_op.set_ifm_ofm_shapes()
385
386 return op
387
388
389def fixup_resizebilinear(op, arch, nng):
390 if op.type == Op.ResizeBilinear and op.run_on_npu:
391 if op.ifm_shapes[0] == op.ofm_shapes[0]:
392 # Bypass nop resizebilinear
393 op.inputs = op.inputs[:1]
394 op.type = Op.Identity
395 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
396 convert_resizebilinear_1x1_to_add(op)
397 else:
398 convert_resizebilinear_to_2x2_pool(op)
399
400 return op
401
402
403def convert_nop_split_to_identity(op, arch, nng):
404 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
405 # the list comprehension should return a list with a single tensor
406 # if it does not, remove_passthrough_tensor will fail appropriately
407 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
408 op.type = Op.Identity
409 return op
410
411
412def rewrite_fully_connected_input(op, arch, nng):
413 if op.type == Op.FullyConnected:
414 n_in_elems = op.weights.shape[-2]
415 elms = op.ifm.elements()
416 batch_size = elms // n_in_elems
417 assert batch_size * n_in_elems == elms
418
419 op.ifm_shapes[0] = Shape4D([batch_size, 1, 1, n_in_elems])
420 return op
421
422
423def convert_batched_fc_shape(op, arch, nng):
424 if op.type == Op.FullyConnected:
425 # Check if the first dimension indicates batching
426 if op.ifm_shapes[0].batch > 1:
427 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
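            # For example, a batch of 8 is mapped to a 1x2x4xC IFM shape; batch sizes not in the
            # table fall back to 1x1xNxC.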
428 n = op.ifm_shapes[0].batch
429 h, w = batching_split.get(n, (1, n))
430 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
431
432 # Reshape Weights to be 4D. IO becomes HWIO
433 weight_tensor = op.inputs[1]
434 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
435 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
436
437 n = op.ofm_shapes[0].batch
438 h, w = batching_split.get(n, (1, n))
439 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
440 return op
441
442
443def unfuse_activation_function(op):
444 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
445 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
446 op.activation = None
447 out_tens = op.outputs[0]
448 intermediate_tens = out_tens.clone("_act_intermediate")
449 act_op.set_output_tensor(out_tens)
450 act_op.add_input_tensor(intermediate_tens)
451 op.set_output_tensor(intermediate_tens)
452 act_op.set_ifm_ofm_shapes()
453
454
455def rewrite_stridedslice_output(op, arch, nng):
456 if not op.run_on_npu or op.type != Op.StridedSlice:
457 return op
458
459 new_axis_mask = op.attrs["new_axis_mask"]
460 shrink_axis_mask = op.attrs["shrink_axis_mask"]
461
462 if shrink_axis_mask == 0 and new_axis_mask == 0:
463 return op
464
465 axis_4D = [0] * len(op.outputs)
466 for idx, out_tens in enumerate(op.outputs):
467 output_shape = list(out_tens.shape)
468
469 if shrink_axis_mask != 0:
470 n = 0
471 axis = 0
472 while shrink_axis_mask:
473 prev_mask = shrink_axis_mask
474 n += 1
475 shrink_axis_mask &= shrink_axis_mask - 1
476 axis = int(math.log2(prev_mask - shrink_axis_mask))
477 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
478
479 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
480 op.attrs["shrink_axis_mask"] = 0
481 if axis >= 0:
482 axis_4D[idx] = axis + (4 - len(output_shape))
483 else:
484 axis_4D[idx] = axis
485 op.ofm_shapes[idx] = Shape4D(output_shape)
486
487 elif new_axis_mask != 0:
488 n = 0
489 axis = 0
490 while new_axis_mask:
491 prev_mask = new_axis_mask
492 n += 1
493 new_axis_mask &= new_axis_mask - 1
494 axis = int(math.log2(prev_mask - new_axis_mask))
495 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
496 new_axis_mask >>= 1
497
498 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
499 op.attrs["new_axis_mask"] = 0
500 if axis >= 0:
501 axis_4D[idx] = axis + (4 - len(output_shape))
502 else:
503 axis_4D[idx] = axis
504 op.ofm_shapes[idx] = Shape4D(output_shape)
505
506 op.attrs["split_axis_4D"] = axis_4D
507 return op
508
509
510def rewrite_unpack_output(op, arch, nng):
511 tens = op.outputs[0]
512 if op.run_on_npu and op.type == Op.Unpack:
513 # Unpack is also referred to as Unstack
514 axis = int(op.attrs["axis"])
515 if axis < 0: # Convert to positive axis
516 axis = len(op.inputs[0].shape) + 1 + axis
517 op.type = Op.UnpackReshaped
518 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
519
520 axis_4D = axis + (4 - len(desired_output_shape))
521 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
522
523 for idx, out_tens in enumerate(op.outputs):
524 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
525 return op
526
527
528def add_padding_fields(op, arch, nng):
529 if op.run_on_npu:
530 if "padding" in op.attrs:
531 input_shape = op.ifm_shapes[0]
532 output_shape = op.ofm_shapes[0]
533 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
534 kernel_size = op.inputs[1].shape[:2]
535 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
536 kernel_size = op.attrs["ksize"][1:3]
537 else:
538 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
539
540 if op.type == Op.Conv2DBackpropInputSwitchedBias:
541 upscaling_factor = output_shape.height // input_shape.height
542 padding, skirt = calc_upscaled_padding_and_skirt(
543 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
544 )
545 else:
546 padding, skirt = calc_padding_and_skirt(
547 op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
548 )
549
550 op.attrs["explicit_padding"] = padding
551 op.attrs["skirt"] = skirt
552
553 return op
554
555
556def reorder_depthwise_weights(op, arch, nng):
557 if op.type.is_depthwise_conv2d_op():
558 weight_tensor = op.inputs[1]
559 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
560 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
561 weight_tensor.weight_transpose_depthwise = True
562
563 return op
564
565
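# Rewrites a stride-2 convolution at the start of the network into a stride-1 equivalent by
# folding pairs of horizontally adjacent IFM pixels into the depth dimension; the kernel width
# is padded to an even size with the weight zero point if needed, then halved while its input
# channels are doubled to match.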
566def optimise_strided_conv(op, arch, nng):
567 stride_x, stride_y = op.get_kernel_stride()
568 ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
569
570 if (
571 op.type == Op.Conv2DBias
572 and op.op_index == 0
573 and stride_x == 2
574 and op.ifm_shapes[0].depth <= 4
575 and op.ifm_shapes[0].width % 2 == 0
576 and weight_tensor is not None
577 and weight_tensor.shape[1] >= 2
578 ):
579 ifm_shape = op.ifm_shapes[0]
580 # IFM
581 op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])
582
583 # Weights
584 weight_shape = weight_tensor.shape
585 if weight_shape[1] % 2 != 0:
586 weight_shape[1] = weight_shape[1] + 1
587 padded_array = np.zeros(weight_shape)
588 for i in range(weight_shape[0]):
589 padded_array[i] = np.vstack(
590 [
591 weight_tensor.values[i],
592 np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
593 ]
594 )
595 weight_tensor.values = padded_array
596 weight_shape[1] //= 2
597 weight_shape[2] *= 2
598 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
599 weight_tensor.set_all_shapes(weight_shape)
600 # If multiple copies of the weights are used, we could avoid
601 # them having the same address by changing the value_id
602 weight_tensor.value_id = uuid.uuid4()
603
604 # Strides
605 stride_x = 1
606 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
607
608 return op
609
610
611def convert_conv_to_fc(op, arch, nng):
612 # Conv 1x1 can be equivalent to Fully Connected.
613 # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
614 # caching/double buffering for the weights.
615 # (Weights don't need to be reloaded for convs when IFM H and W are 1)
616 if op.type == Op.Conv2DBias:
617 h = op.ifm_shapes[0].height
618 w = op.ifm_shapes[0].width
619 kh, kw, _, _ = op.inputs[1].shape
620 if h == 1 and w == 1 and kh == 1 and kw == 1:
621 # Overwrite this op as a Fully Connected Op
622 op.name += "_fc"
623 op.type = Op.FullyConnected
624 op.attrs = {
625 "weights_format": 0,
626 }
627 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
628 weight_tensor = op.inputs[1]
629 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
630 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
631
632 DebugDatabase.add_optimised(op, op)
633 return op
634
635
636def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
637 if op.run_on_npu and op.type.is_relu_op():
638 ifm = op.inputs[0]
639 ofm = op.outputs[0]
640 # Relu with differing IFM and OFM scaling cannot be fused with another primary op
641 # and requires its own to be inserted
642 if not check_quantized_tens_scaling_equal(ifm, ofm):
643 # Override this op with its own primary op (avgpool)
644 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
645 # And fuse the original activation function to it
646 relu_fused_op.activation = create_activation_function(op.type)
647 # Tidy up and assign the ifm and ofm to the new op
648 ifm.consumer_list.remove(op)
649
650 relu_fused_op.add_input_tensor(ifm)
651 relu_fused_op.set_output_tensor(ofm)
652 relu_fused_op.set_ifm_ofm_shapes()
653 op = relu_fused_op
654 return op
655
656
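# Aligns the shapes of binary elementwise inputs: a lower-rank input (e.g. a [16] IFM2 against a
# 4D IFM) is extended with leading ones, and a scalar produced by another op is given an explicit
# broadcastable shape.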
657def fixup_elementwise_with_scalars(op, arch, nng):
658 if op.type.is_binary_elementwise_op():
659 ifm_tensor, ifm2_tensor, _, _ = op.get_ifm_ifm2_weights_ofm()
660 if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
661 diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
662 if diff > 0:
663 ifm2_tensor.shape = full_shape(len(ifm_tensor.shape), ifm2_tensor.shape, 1)
664 elif diff < 0:
665 ifm_tensor.shape = full_shape(len(ifm2_tensor.shape), ifm_tensor.shape, 1)
666 elif ifm_tensor.shape == [] and ifm_tensor.values is None:
667 # IFM is marked as a scalar, but is a result of an operation; change it to a shape of size 1
668 ifm_tensor.shape = len(ifm2_tensor.shape) * [1]
669 ifm_tensor.storage_shape = ifm_tensor.shape
670 elif ifm2_tensor.shape == [] and ifm2_tensor.values is None:
671 # IFM2 is marked as a scalar, but is a result of an operation; change it to a shape of size 1
672 ifm2_tensor.shape = len(ifm_tensor.shape) * [1]
673 ifm2_tensor.storage_shape = ifm2_tensor.shape
674 return op
675
676
677def convert_softmax(op, arch, nng):
678 if op.type == Op.Softmax and op.run_on_npu:
679 softmax = SoftMax(op)
680 op = softmax.get_graph()
681 return op
682
683
684def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
685 r"""Whenever there is a subgraph with this topology:
686
687 Input X For X = -1 or X > 0
688 | \ / This subgraph can be replaced with either
689 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
690 | /
691 Max
692 """
693
694 if op.type == Op.Maximum:
695 # finds the Mul input(s) to the Max
696 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
697 if len(muls) == 1:
698 mul = muls[0].ops[0]
699 elif len(muls) == 2:
700 # In the case both inputs are Muls, find the one with the same input as the Max
701 mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
702 else:
703 # No Mul inputs
704 return op
705
706 # make sure the Mul doesn't have any other consumers
707 mul_ofm = mul.outputs[0]
708 if len(mul_ofm.consumers()) != 1:
709 return op
710 # make sure the Mul doesn't have a fused activation function
711 if mul.activation:
712 return op
713 ifm, ofm = op.get_ifm_ofm()
714 if ifm is None or ofm is None:
715 return op
716
717 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
718 return op
719 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
720 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
721 return op
722
723 # finds the branched input that goes to both the Max and the Mul
724 shared = set(op.inputs) & set(mul.inputs)
725 if len(shared) == 1:
726 shared_in = shared.pop()
727 # find the constant scalar input to the Mul
728 const_tens = (set(mul.inputs) - {shared_in}).pop()
729 # check that it is a scalar
730 if const_tens.shape != []:
731 return op
732 const = const_tens.ops[0]
733 # check that it is a constant
734 if const.type != Op.Const:
735 return op
736 # Remove the Mul from the shared input's consumers
737 shared_in.consumer_list.remove(mul)
738 else:
739 return op
740
741 val = const.outputs[0].values
742 if val >= 0:
743 new_op = Op.LeakyRelu
744 op.attrs["alpha"] = val
745 # to produce bit exact results, the alpha is not enough;
746 # save additional scaling info in attr "alpha_scale", to be used as input
747 # to the LUT construction
748 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
749 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
750 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
751 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
752 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
753 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
754 elif val == -1:
755 new_op = Op.Abs
756 else:
757 return op
758
759 op.type = new_op
760 op.name = op.name.replace("Maximum", new_op.name)
761 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
762 op.inputs = [shared_in]
763 op.set_ifm_ofm_shapes()
764
765 # Record optimisation in debug database
766 DebugDatabase.add_optimised(op, op)
767
768 return op
769
770
771def convert_hardswish_to_lut(op, arch, nng):
772 if op.type == Op.HardSwish:
773 ifm, ofm = op.get_ifm_ofm()
774 # Generate the LUT
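        # The LUT built below evaluates hardswish, x * relu6(x + 3) / 6, in 16-bit fixed point
        # for every possible quantized input value.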
775 ifm_scale = np.double(ifm.quantization.scale_f32)
776 ofm_scale = np.double(ofm.quantization.scale_f32)
777 zp_in = ifm.quantization.zero_point
778 zp_out = ofm.quantization.zero_point
779 ifm_scale_hires = (1 / 128) * ifm_scale
780 relu_multiplier = np.double(3 / 32768)
781 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
782 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
783 # Use 16bit scale
784 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
785 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
786
787 values = []
788 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
789 quantized_min = min(ix)
790 quantized_max = max(ix)
791 for x in ix:
792 input_value = x - zp_in
793 input_value_hires = input_value * 128
794 # Compute the input value on essentially the output scale, not shifted yet
795 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
796 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
797 relu_value = np.int16(input_value_hires)
798 if relu_shift < 31:
799 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
800
801 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
802
803 if relu_shift < 31:
804 relu_value = fp_math.shift_left16(relu_value, 1)
805
806 if relu_shift > 31:
807 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
808
809 # Rescale the value into a 16bit fixedpoint relu_value in [-1, 1]
810 # Now convert that to a 16bit fixedpoint value in [0, 1]
811 relu_value = (relu_value + (1 << 15)) >> 1
812 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
813 shift = 31 - out_shift
814 shift = -shift if shift < 0 else 0
815 # Finally apply the output shift
816 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
817 lut_result = min(quantized_max, max(quantized_min, lut_result))
818 values.append(lut_result)
819 return convert_to_lut(op, values, "hardswish")
820 return op
821
822
823def convert_lrelu_to_mul_max(op, arch):
824 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
825 # (the opposite of convert_mul_max_to_abs_or_lrelu)
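    # For 0 < alpha < 1 this reproduces LeakyRelu: the Mul path scales every value by alpha,
    # and the Max then selects the unscaled value whenever the input is positive.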
826 ifm, ofm = op.get_ifm_ofm()
827 if ifm is None or ofm is None:
828 return op
829
830 # Add multiplication with alpha
831 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
832 mul_alpha.add_input_tensor(ifm)
833 # Create const tensor containing alpha as scalar
834 alpha = np.float32(op.attrs["alpha"])
835 quantization = ifm.quantization.clone()
836 quantization.min = 0
837 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
838 quantization.zero_point = 0
839 if np.isinf(1 / alpha):
840 # Handling of alpha near zero
841 quantization.scale_f32 = np.float32(1)
842 scalar = 0
843 else:
844 quantization.scale_f32 = alpha
845 scalar = alpha
846 alpha_tens = create_const_tensor(
847 op.name + "_alpha_scalar", [], ifm.dtype, [scalar], np.float32, quantization=quantization
848 )
849 alpha_tens.values = np.array([1])
850 mul_alpha.add_input_tensor(alpha_tens)
851 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
852 mul_alpha.set_output_tensor(fm_alpha)
853 mul_alpha.set_ifm_ofm_shapes()
854 DebugDatabase.add_optimised(op, mul_alpha)
855
856 if check_quantized_tens_scaling_equal(ifm, ofm):
857 # No identity multiplication is needed
858 fm_id = ifm
859 else:
860 # Add multiplication with identity
861 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
862 mul_identity.add_input_tensor(ifm)
863 # Create const tensor containing identity as scalar
864 quantization = ifm.quantization.clone()
865 quantization.min = 0
866 quantization.max = quantization.quant_max - quantization.quant_min
867 quantization.scale_f32 = np.float32(1)
868 quantization.zero_point = 0
869 identity_tens = create_const_tensor(
870 op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
871 )
872 mul_identity.add_input_tensor(identity_tens)
873 # Make sure that fm_id is allocated to a different address than fm_alpha
874 fm_id = ofm.clone(op.name + "_id", set_unique=True)
875 mul_identity.set_output_tensor(fm_id)
876 mul_identity.set_ifm_ofm_shapes()
877 DebugDatabase.add_optimised(op, mul_identity)
878
879 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
880 op.type = Op.Maximum
881 op.name = op.name.replace("LeakyRelu", "Maximum")
882 op.inputs = []
883 ifm.consumer_list.remove(op)
884 op.add_input_tensor(fm_alpha)
885 op.add_input_tensor(fm_id)
886 op.set_ifm_ofm_shapes()
887
888 DebugDatabase.add_optimised(op, op)
889 return op
890
891
892def convert_to_lut(op, lut_values, lut_name):
893 # Rewrite the operation by Add with scalar 0 + LUT activation
894 ifm = op.inputs[0]
895 if ifm is None:
896 return op
897 assert ifm.dtype.size_in_bytes() == 1
898 op.type = Op.Add
899 op.name = op.name + "_lut_" + lut_name
900 # Mark as no-op to enable potential fusing optimizations
901 op.attrs["is_nop"] = True
902 # Create an input tensor containing scalar zero
903 quantization = QuantizationParameters(0.0, 255.0)
904 quantization.scale_f32 = ifm.quantization.scale_f32
905 quantization.zero_point = 0
906 tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
907 op.add_input_tensor(tens)
908 op.ifm_shapes.append(Shape4D(tens.shape))
909
910 # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
911 # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
912 # should be the same as the IFM
913 op.forced_output_quantization = ifm.quantization
914 lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
915 op.set_activation_lut(lut_tensor)
916 op.set_ifm_ofm_shapes()
917 return op
918
919
920def convert_to_lut8(op, fn, fn_name):
921 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
922 # fn is a function(real) -> real
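    # Illustrative example: with ifm_scale = 1/128 and zero point 0, the LUT entry for x = 64
    # is computed from fn(0.5), requantized with the OFM scale and clamped to the output range.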
923 ifm, ofm = op.get_ifm_ofm()
924 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
925 return op
926 # Generate the LUT
927 ifm_scale = np.double(ifm.quantization.scale_f32)
928 ofm_scale = np.double(ofm.quantization.scale_f32)
929 zp_in = ifm.quantization.zero_point
930 zp_out = ofm.quantization.zero_point
931 values = []
932 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
933 quantized_min = min(ix)
934 quantized_max = max(ix)
935 for x in ix:
936 x_real = ifm_scale * (x - zp_in)
937 y_real = fn(x_real)
938 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
939 lut_result = min(quantized_max, max(quantized_min, lut_result))
940 values.append(lut_result)
941 return convert_to_lut(op, values, fn_name)
942
943
944def convert_lrelu_to_lut(op, arch):
945 ifm, ofm = op.get_ifm_ofm()
946 # Generate the LUT
947 alpha = op.attrs["alpha"]
948 ifm_scale = np.double(ifm.quantization.scale_f32)
949 ofm_scale = np.double(ofm.quantization.scale_f32)
950 zp_in = ifm.quantization.zero_point
951 zp_out = ofm.quantization.zero_point
952 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
953 alpha_scalar = 1
954 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
955 if "alpha_scaling" in op.attrs:
956 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
957 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
958 values = []
959 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
960 quantized_min = min(ix)
961 quantized_max = max(ix)
962 for x in ix:
963 if x < zp_in:
964 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
965 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
966 )
967 else:
968 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
969 lut_result = min(quantized_max, max(quantized_min, lut_result))
970 values.append(lut_result)
971 return convert_to_lut(op, values, "lrelu")
972
973
974def convert_lrelu(op, arch, nng):
975 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
976 if op.type != Op.LeakyRelu:
977 return op
978 ifm, ofm = op.get_ifm_ofm()
979 if ifm is None or ofm is None:
980 return op
981 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
982 # use LUT for int8/uint8
983 return convert_lrelu_to_lut(op, arch)
984 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16:
985 # use LeakyRelu unmodified for int16 with equal input/output scaling
986 return op
987 return convert_lrelu_to_mul_max(op, arch)
988
989
990def convert_tanh_sigmoid_to_lut(op, arch, nng):
991 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
992 if op.type == Op.Sigmoid:
993 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
994 elif op.type == Op.Tanh:
995 return convert_to_lut8(op, math.tanh, "tanh")
996 return op
997
998
999def remove_reshape_and_squeeze_ops(op, arch):
1000 if op.run_on_npu and op.type in (Op.Reshape, Op.Squeeze):
1001 ofm = op.ofm
1002 ifm = op.ifm
1003
1004 # Check if quantization is the same in the input and output for the reshape ops
1005 if not check_quantized_tens_scaling_equal(ifm, ofm):
1006 # TODO Both tensors are needed, since quantisation properties currently are linked to Tensors.
1007 # In order to remove this reshape, either quantization properties need to be moved to Operator,
1008 # or the reshape needs to be replaced with a NOP.
1009 return
1010
1011 bypass_reshape_and_squeeze_ops(op)
1012
1013
1014def fuse_activation_function_with_prev(op, arch, nng):
1015 # if op is a no-op: attempts to move the activation function to the preceding op
1016 if not op.attrs.get("is_nop", False) or op.activation is None:
1017 return op
1018 ifm, ofm = op.get_ifm_ofm()
1019 if ifm is None or ofm is None:
1020 return op
1021 # finds the input(s) to the operation
1022 prev_op = ifm.ops[0]
1023 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1024 fuse = (
1025 prev_op.run_on_npu
1026 and prev_op.type.npu_block_type != NpuBlockType.Default
1027 and len(ifm.ops) == 1
1028 and len(prev_op.outputs[0].consumers()) == 1
1029 and prev_op.activation is None
1030 )
1031 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1032 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1033 # LUT currently only works correctly for elementwise ops
1034 fuse = False
1035 if not fuse:
1036 return op
1037 # Move the fused activation function + corresponding info to prev_op
1038 prev_op.activation = op.activation
1039 prev_op.forced_output_quantization = op.forced_output_quantization
1040 if op.activation_lut is not None:
1041 prev_op.set_activation_lut(op.activation_lut)
1042 # Bypass op
1043 prev_op.set_output_tensor(ofm)
1044 DebugDatabase.add_optimised(op, prev_op)
1045 return op
1046
1047
1048def _leading_pad_ok(leading_pad, stride, kernel_size):
1049 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1050 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
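    # For example, kernel size 7 with stride 2 gives max_size 3: a leading pad of 1 is rejected,
    # while 0, 2 and 3 are accepted.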
1051 max_size = kernel_size // 2
1052 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1053
1054
1055def replace_pad_by_hw_pad(op: Operation, arch, nng):
1056 """
1057 Tries to completely remove a PAD operator by using hardware padding.
1058 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1059 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1060 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1061 if both operations can be run on the NPU.
1062 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1063 """
1064 if (
1065 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
1066 and op.run_on_npu
1067 and op.attrs["padding"] == Padding.VALID
1068 ):
1069 pad_op = op.ifm.ops[0]
1070 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1071 return op
1072 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1073 return op
1074 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1075 k = op.kernel
1076 k_w, k_h = k.dilated_wh()
1077
1078 # Check if the PAD operator can be replaced by hardware padding
1079 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1080 # Too much padding, it would require hardware padding to actually insert zeros
1081 return op
1082 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1083 return op
1084
1085 if op.type.is_avgpool_op():
1086 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1087 for pad, k_size in (
1088 (left, k_w),
1089 (right, k_w),
1090 (top, k_h),
1091 (bottom, k_h),
1092 ):
1093 if pad not in (0, k_size // 2):
1094 return op
1095 # Average pool is converted to depthwise, because NPU average pool + same padding
1096 # has a special implementation that is different from PAD followed by average pool with
1097 # valid padding.
1098 k_w, k_h = op.kernel.width, op.kernel.height
1099 ifm = op.ifm
1100 # Remember other inputs
1101 other_inputs = op.inputs[1:]
1102 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1103 quantization = QuantizationParameters(0.0, 255.0)
1104 quantization.scale_f32 = 1.0 / (k_w * k_h)
1105 quantization.zero_point = 0
1106 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1107 weights = np.full(shape, 1)
1108
1109 weight_tens = create_const_tensor(
1110 op.name + "_weights",
1111 shape,
1112 op.ifm.dtype,
1113 weights,
1114 np.uint8,
1115 purpose=TensorPurpose.Weights,
1116 quantization=quantization,
1117 )
1118 weight_tens.values = weights
1119 op.type = Op.DepthwiseConv2DBias
1120 op.inputs = []
1121 op.add_input_tensor(ifm)
1122 op.add_input_tensor(weight_tens)
1123 # Add bias tensor, all biases set to 0
1124 op.inputs.append(None)
1125 fixup_bias_tensors(op, arch, nng)
1126 # Add other inputs
1127 op.inputs.extend(other_inputs)
1128 op.rounding_mode = NpuRoundingMode.NATURAL
1129
1130 # Bypass the PAD operator
1131 op.set_input_tensor(pad_op.ifm, 0)
1132 # Adjust the padding attributes of the convolution operator
1133 op.attrs["padding"] = Padding.EXPLICIT
1134 op.attrs["explicit_padding"] = (top, left, bottom, right)
1135 op.set_ifm_ofm_shapes()
1136 return op
1137
1138
1139def convert_pad(op: Operation, arch, nng):
1140 """
1141 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1142 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1143 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1144 """
1145 if op.type != Op.Pad or not op.run_on_npu:
1146 return op
1147 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1148
1149 ifm = op.ifm
1150 assert ifm is not None
1151 ifm_shape = Shape4D(ifm.shape)
1152 ofm = op.ofm
1153 assert ofm is not None
1154 ofm.ops = []
1155 ofm_shape = op.ofm_shapes[0]
1156
1157 # Average pool op that copies IFM to the right place inside the OFM
1158 shp0 = Shape4D(0, 0, 0, 0)
1159 shp_top = shp0.with_height(top)
1160 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1161 avgpool_op.activation = op.activation
1162 quant = ofm.quantization
1163 pad_value = quant.zero_point
1164 # Add operations that fill the borders of the OFM
1165 if top > 0:
1166 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1167 zero_tens = create_const_tensor(
1168 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1169 )
1170 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1171 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1172 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1173 if bottom > 0:
1174 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1175 zero_tens = create_const_tensor(
1176 op.name + "_bottom",
1177 shape.as_list(),
1178 ofm.dtype,
1179 shape.elements() * [pad_value],
1180 np.uint8,
1181 quantization=quant,
1182 )
1183 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1184 create_avg_pool_for_concat(
1185 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1186 )
1187 if left > 0:
1188 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1189 zero_tens = create_const_tensor(
1190 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1191 )
1192 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1193 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1194 if right > 0:
1195 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1196 zero_tens = create_const_tensor(
1197 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1198 )
1199 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1200 create_avg_pool_for_concat(
1201 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1202 )
1203
1204 op.type = Op.ConcatTFLite
1205 return avgpool_op
1206
1207
1208def add_attrs_to_resizebilinear(op, arch, nng):
1209 if op.type == Op.ResizeBilinear and op.run_on_npu:
1210 input_tensor = op.inputs[0]
1211 input_shape = op.ifm_shapes[0]
1212 upscaled_height = input_shape.height * 2
1213 upscaled_width = input_shape.width * 2
1214 out_shape = op.ofm_shapes[0]
1215 if not op.attrs["align_corners"] and out_shape.height == upscaled_height and out_shape.width == upscaled_width:
1216 # this means the output is supposed to be a 2x upscale,
1217 # so we need to do SAME padding
1218 op.attrs["padding"] = Padding.SAME
1219 elif (
1220 op.attrs["align_corners"]
1221 and out_shape.height == (upscaled_height - 1)
1222 and out_shape.width == (upscaled_width - 1)
1223 ):
1224 # here we can just run the avg pool without padding and
1225 # produce a (M * 2 - 1, N * 2 - 1) sized output
1226 op.attrs["padding"] = Padding.VALID
1227 else:
1228 return op
1229 input_tensor.resampling_mode = resampling_mode.NEAREST
1230 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
1231 return op
1232
1233
1234def fixup_bias_tensors(op, arch, nng):
1235 if op.type.needs_bias() and op.bias is None:
1236 # Op has no bias, add bias tensor filled with zeros
1237 nr_biases = op.inputs[1].shape[-1]
1238 bias_values = [0] * nr_biases
1239 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1240 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1241
1242 return op
1243
1244
1245def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1246 if op.type == Op.Mean and op.run_on_npu:
1247 keep_dims = op.attrs.get("keep_dims", False)
1248 inp, axis = op.inputs
1249 shape = inp.shape
1250 dims = len(shape)
1251
1252 # Height and width axes have different index depending on dimensions
1253 if axis.shape == [] or axis.shape[0] == 1: # single axis
1254 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1255 if dims in (2, 3):
1256 if axis == 0:
1257 h, w = shape[axis], 1
1258 else:
1259 h, w = 1, shape[axis]
1260 else:
1261 if axis == 1:
1262 h, w = shape[axis], 1
1263 else:
1264 h, w = 1, shape[axis]
1265 else: # multiple axes
1266 axis = sorted(axis.values)
1267 h, w = [shape[i] for i in axis]
1268
1269 # Set necessary depthwise attributes
1270 op.attrs.update(
1271 {
1272 "padding": Padding.VALID,
1273 "stride_h": 1,
1274 "stride_w": 1,
1275 "strides": (1, 1, 1, 1),
1276 "depth_multiplier": 1,
1277 "channel_multiplier": 1,
1278 "dilation_h_factor": 1,
1279 "dilation_w_factor": 1,
1280 "dilation": (1, 1, 1, 1),
1281 }
1282 )
1283 # Change op type
1284 op.type = Op.DepthwiseConv2DBias
1285 # Set IFM/OFM shapes after changing op type
1286 op.set_ifm_ofm_shapes()
1287
1288 weight_scale, bias = 1, None
1289 ofmq, ifmq = op.ofm.quantization, inp.quantization
1290 # Set rounding mode, scaling and zero point based on which reference implementation to match
1291 if len(shape) == 4 and axis == [1, 2] and keep_dims:
1292 if inp.dtype == DataType.uint8:
1293 # This attribute means a different scaling calculation is used in order to match reference
1294 op.low_precision_scaling = True
1295 weight_scale = h * w
1296 # Set zero points to 0 as they will be adjusted for with bias term
1297 foq = ofmq.clone()
1298 foq.zero_point = 0
1299 fiq = ifmq.clone()
1300 fiq.zero_point = 0
1301 op.forced_input_quantization = fiq
1302 bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
1303 # If the bias term is outside uint8 range, we need an Add op to apply it.
1304 if bias_term < 0 or bias_term > 255:
1305 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1306 # Bias term has higher bitness (i32) than input/output (u8).
1307 # 16 bits is enough since the bias is added/subtracted from a u8 value,
1308 # the bias can only effectively assume values in the range [-255, 255].
1309 intermediate.dtype = DataType.int16
1310 intermediate.quantization.zero_point = 0
1311 add_op = Operation(Op.Add, op.name + "_bias")
1312 add_op.forced_output_quantization = foq
1313 add_op.add_input_tensor(intermediate)
1314 quant = QuantizationParameters()
1315 quant.zero_point = 0
1316 bias_term_tens = create_const_tensor(
1317 op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
1318 )
1319 add_op.add_input_tensor(bias_term_tens)
1320 add_op.set_output_tensor(op.ofm)
1321 add_op.set_ifm_ofm_shapes()
1322 add_op.activation = op.activation
1323 op.activation = None
1324 op.set_output_tensor(intermediate)
1325 op.set_ifm_ofm_shapes()
1326 # If not, we can just do it with the OFM zero point.
1327 else:
1328 foq.zero_point = bias_term
1329 op.forced_output_quantization = foq
1330 else:
1331 assert inp.dtype == DataType.int8
1332 # Use a depthwise to calculate the sum,
1333 # followed by a multiplication with 1/N to get the MEAN
1334 weight_scale = 1
1335 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1336 intermediate.dtype = DataType.int16
1337 mul_op = Operation(Op.Mul, op.name + "_mul")
1338 mul_op.add_input_tensor(intermediate)
1339 # Create scalar containing 1/N
1340 quant = QuantizationParameters()
1341 quant.zero_point = 0
1342 # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
1343 # while rounding mode NATURAL would round this to -1.
1344 # This can only occur if N is even, and can be emulated by
1345 # multiplying with a number that is slightly smaller than 1/N.
1346 # It must be so small that other roundings are not affected;
1347 # the calculated value is based on worst case,
1348 # which is sum 256 * N (the maximum sum that can occur with int8)
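            # For example, with N = 4 a sum of -6 has reference mean -2 (-1.5 rounded downwards);
            # scaling by 1/(4 - eps) gives roughly -1.5003, which NATURAL rounding also takes to -2.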
1349 n = int(h * w)
1350 eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
1351 quant.scale_f32 = 1 / (n - eps)
1352 scalar = create_const_tensor(
1353 op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
1354 )
1355 mul_op.add_input_tensor(scalar)
1356 mul_op.set_output_tensor(op.ofm)
1357 mul_op.set_ifm_ofm_shapes()
1358 mul_op.rounding_mode = NpuRoundingMode.NATURAL
1359 mul_op.activation = op.activation
1360 op.activation = None
1361 op.set_output_tensor(intermediate)
1362 op.set_ifm_ofm_shapes()
1363 elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
1364 # Here we can just use a simple AvgPool with truncating rounding,
1365 # as we're emulating simple integer division.
1366 op.rounding_mode = NpuRoundingMode.TRUNCATE
1367 op.type = Op.AvgPool
1368 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1369 else:
1370 op.rounding_mode = NpuRoundingMode.NATURAL
1371 weight_scale = 1 / (h * w)
1372 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1373 bias = -ifmq.zero_point * h * w
1374 fiq = ifmq.clone()
1375 fiq.zero_point = 0
1376 op.forced_input_quantization = fiq
1377
1378 # Change dimensions to 4
1379 if dims < 4:
1380 shape = [1] + shape
1381 if dims == 2:
1382 shape += [1]
1383
1384 # If height is greater than max kernel height, reshape to from HxW to 1x(HxW)
1385 if h > 64:
1386 shape = [shape[0], 1, h * w, shape[3]]
1387 op.ifm_shapes[0] = Shape4D(shape)
1388 if h > 256 and op.type == Op.AvgPool:
1389 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1390
1391 # If the AvgPool version is used, we don't need to do anything else
1392 if op.type == Op.AvgPool:
1393 return op
1394
1395 # Make unit weight tensor quantization
1396 weight_quant = ifmq.clone()
1397 weight_quant.min = 0
1398 weight_quant.max = 255
1399 weight_quant.scale_f32 = weight_scale
1400 weight_quant.zero_point = 0
1401
1402 # Set weight shape to [H,W,C,B]
1403 weight_shape = shape[1:4] + [shape[0]]
1404 # Add unit weight tensor
1405 op.set_input_tensor(
1406 create_const_tensor(
1407 "weights",
1408 weight_shape,
1409 inp.dtype,
1410 np.ones(weight_shape),
1411 value_dtype=np.uint8,
1412 quantization=weight_quant,
1413 ),
1414 1,
1415 )
1416 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
1417
1418 # Add None bias tensor
1419 op.inputs.append(None)
1420 # Add bias tensor
1421 if bias:
1422 bias_shape = [shape[-1]]
1423 op.set_input_tensor(
1424 create_const_tensor(
1425 "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
1426 ),
1427 2,
1428 )
1429
1430 return op
1431
1432
1433def supported_operator_check(op, arch, nng):
1434 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
1435 return op
1436
1437
1438def tflite_optimise_graph(nng, arch):
1439 # Pre-processing step
1440 pre_process_list = [
1441 supported_operator_check,
1442 set_ifm_ofm_op_shapes,
1443 ]
1444
1445 for idx, sg in enumerate(nng.subgraphs):
1446 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1447 nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
1448 )
1449
1450 # Handle Concat Ops
1451 for idx, sg in enumerate(nng.subgraphs):
1452 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1453 sg.refresh_after_modification()
1454
1455 # Handle Split Ops
1456 for idx, sg in enumerate(nng.subgraphs):
1457 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1458 nng,
1459 sg,
1460 arch,
1461 [],
1462 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1463 rewrite_unsupported=False,
1464 )
1465
1466 for idx, sg in enumerate(nng.subgraphs):
1467 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1468 nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
1469 )
1470
1471 # Handle sg input output
1472 for idx, sg in enumerate(nng.subgraphs):
1473 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1474 nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
1475 )
1476
1477 # Removal of reshape and squeeze ops
1478 for sg in nng.subgraphs:
1479 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_reshape_and_squeeze_ops])
1480 sg.refresh_after_modification()
1481
1482 # Rewrite of operators
1483 op_rewrite_list = [
1484 set_tensor_equivalence,
1485 convert_mean_to_depthwise_conv_or_avgpool,
1486 convert_depthwise_to_conv,
1487 convert_conv_to_fc,
1488 convert_softmax,
1489 optimise_strided_conv,
1490 convert_hardswish_to_lut,
1491 rewrite_fully_connected_input,
1492 convert_batched_fc_shape,
1493 fixup_conv2d_backprop,
1494 fixup_relus_with_differing_ifm_ofm_scaling,
1495 fixup_elementwise_with_scalars,
1496 reorder_depthwise_weights,
1497 fixup_resizebilinear,
1498 fixup_bias_tensors,
1499 convert_mul_max_to_abs_or_lrelu,
1500 convert_lrelu,
1501 convert_tanh_sigmoid_to_lut,
1502 replace_pad_by_hw_pad,
1503 ]
1504
1505 for idx, sg in enumerate(nng.subgraphs):
1506 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1507 nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
1508 )
1509
1510 for idx, sg in enumerate(nng.subgraphs):
1511 # remove passthrough tensors and attempt further optimizations
1512 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1513 nng,
1514 sg,
1515 arch,
1516 [remove_passthrough_tensor],
1517 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
1518 )
1519
1520 # Removal of SplitSliceRead, need to be done after optimisation has been performed,
1521 # since ifm/ofm_shapes are of importance to this function
1522 for sg in nng.subgraphs:
1523 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
1524 sg.refresh_after_modification()
1525
1526 return nng