# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import check_quantized_tens_scaling_equal

memory_only_ops = (
    Op.Reshape,
    Op.Squeeze,
)


def _avoid_nhcwb16_for_concat(tens):
    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
    # multiple of 16. This is because only then will the address offset for the ofm, for all operations, be 16 byte
    # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            read_offset = cons_op.read_offsets[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            read_offset = cons_op.read_offsets[1]
        else:
            assert False
        if read_offset is not None and (read_offset[-1] % 16) != 0:
            return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


# Check if non linear format can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
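                # Recursively walk chains of Reshape consumers; yields True for any
                # consumer that is missing (None) or not run on the NPU.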
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
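                    # Recursively collect this Reshape and any chained Reshape consumers.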
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
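    # Example: input_size=224, stride=2, filter_size=3 gives out_size=112,
    # needed_input=225 and a total padding of 1.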
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding


# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_reshape_and_squeeze_ops(op):
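    # Bypass the memory-only op by connecting its ifm and ofm directly,
    # keeping whichever tensor has to persist (subgraph boundary or CPU produced/consumed).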
    assert op.type in (Op.Reshape, Op.Squeeze)
    ofm = op.ofm
    ifm = op.ifm
    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
    # Check if the ifm is produced by the CPU or the ofm is consumed by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    # This case should be handled prior to this function
    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

    if ofm_is_sg_ofm or ofm_is_cpu_consumed:
        # Bypassed by replacing ifm with ofm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

        # All ifm consumers need to use ofm as input
        for ifm_cons in ifm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                if cons_ifm == ifm:
                    ifm_cons.set_input_tensor(ofm, ifm_idx)
    else:
        # Bypassed by replacing ofm with ifm
        for cons in ofm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(cons.inputs):
                if cons_ifm == ofm:
                    cons.set_input_tensor(ifm, ifm_idx)


def move_splitsliceread_to_consumer(op, cons_op):
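    # Fold the SplitSliceRead's read offset/shape into cons_op and connect cons_op
    # directly to op.ifm, bypassing the intermediate tensor.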
    assert op.type == Op.SplitSliceRead

    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]

    if "skirt" in cons_op.attrs:
        assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
        cons_op.attrs["skirt"] = None
        cons_op.attrs["force_padding"] = True
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)


def check_reshapes(op, arch):
    if op.run_on_npu and op.type == Op.Reshape:
        ofm = op.ofm

        if check_quantized_tens_scaling_equal(op.ifm, ofm):
            # Reshape should have been removed
            raise VelaError(f"Reshape op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_after_tens(tens):
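    # Create a copy of tens via an AveragePool NOP and redirect all consumers of
    # tens to the copy, so that the original tensor can be preserved.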
    tens_cons_list_copy = tens.consumer_list.copy()

    # Create an avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    # Set copy_ifm consumers
    for tens_cons in tens_cons_list_copy:
        if tens_cons is not None:
            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
                if cons_inp == tens:
                    tens_cons.set_input_tensor(copy_tens, ifm_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in (Op.Reshape, Op.Squeeze):
        return op

    # When the Reshape/Squeeze operators are removed, their tensors are removed as well.
    # But in order to do this, the tensors cannot be outputs of the sg;
    # this needs to be fixed prior to the removal.
    # The solution is to add an avgpool NOP to maintain the original tensor.
    # This is also valid when the reshape ifm/ofm is produced by or consumed by the CPU.

    # Check if operator ifm/ofm are sg ifm/ofm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
    # Check if the ifm is produced by the CPU or the ofm is consumed by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
        # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the Reshape/Squeeze
        insert_copy_op_after_tens(op.ifm)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )
        DebugDatabase.add_optimised(op, op)
    return op