# Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from . import lut
from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters

memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)


def _avoid_nhcwb16_for_concat(tens):
    # If the concat axis corresponds to the C-dimension, NHCWB16 can only be used for the output if all the
    # concat_start values are a multiple of 16, because only then is the address offset of the ofm 16 byte
    # aligned for every operation. For any other axis the address offsets are all based on c = 0, and those
    # addresses are always 16 byte aligned due to the NHCWB16 format.
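    # Illustrative example (hypothetical values): concatenating tensors of depth 24 and 40 along C gives
    # write offsets 0 and 24 in the ofm; 24 % 16 != 0, so NHCWB16 has to be avoided for the output.
    # With depths 32 and 48 the offsets are 0 and 32, both 16-aligned, so NHCWB16 remains allowed.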
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input
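    # Illustrative example (hypothetical values): splitting a depth-40 tensor into [16, 24] along C gives the
    # second consumer a read offset of 16 (16-aligned), whereas a [24, 16] split gives a read offset of 24,
    # which forces the input tensor to stay in linear (NHWC) format.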

    # Return True if NHCWB16 needs to be avoided
    def offset_not_aligned(read_offset):
        return read_offset is not None and (read_offset.depth % 16) != 0

    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            if offset_not_aligned(cons_op.read_offsets[0]):
                return True
        if cons_op.ifm2 is not None and cons_op.ifm2 == tens:
            if offset_not_aligned(cons_op.read_offsets[1]):
                return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


# Check if the non-linear format (NHCWB16) can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check that all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    # The resize bilinear half pixel center implementation requires an OFM with linear format to
    # allow stride modification in the H/W dimensions.
    for op in tens.ops:
        if op.original_type == Op.ResizeBilinear and op.type == Op.DepthwiseConv2DBias:
            return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
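    # Returns the total padding needed so that ceil(input_size / stride) output elements can be produced
    # ("SAME"-style padding). Worked example (illustrative values): input_size=10, stride=2, filter_size=3
    # gives out_size=5, needed_input=(5 - 1) * 2 + 3 = 11, so total_padding = max(0, 11 - 10) = 1.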
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding


# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_memory_only_ops(op):
    assert op.type in memory_only_ops
    ofm = op.ofm
    ifm = op.ifm

    # Check if ifm is subgraph ifm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    # Check if ifm is produced by CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    # This case should be handled prior to this function
    assert not (ifm_is_sg_ifm or ifm_is_cpu_produced)

    # Bypass the op by replacing its ifm with its ofm: the producers of the ifm now write directly to the ofm
    ofm.ops = []
    for prev_op in ifm.ops:
        prev_op.outputs = [ofm]
        ofm.ops.append(prev_op)

    # All ifm consumers need to use ofm as input
    for ifm_cons in ifm.consumer_list:
        for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
            if cons_ifm == ifm:
                ifm_cons.set_input_tensor(ofm, ifm_idx)


def move_splitsliceread_to_consumer(op, cons_op):
    assert op.type == Op.SplitSliceRead

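    # Fold the SplitSliceRead into its consumer: copy the read offset/shape onto the consumer's ifm (or ifm2),
    # rewire the consumer to read the SplitSliceRead's ifm directly, and detach the SplitSliceRead from the graph.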
    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]

    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)


def check_memory_only_removed(op, arch):
    if op.run_on_npu and op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_after_ifm(op):
    tens = op.ifm

    # Create an avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    op.set_input_tensor(copy_tens, 0)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    # For the memory only operators we want to remove, the ifm tensor is replaced by the ofm tensor.
    # But in order to do this, the ifm must not be an input to the subgraph and must not have more
    # than one consumer. This needs to be fixed prior to the removal.
    # The solution is to add an avgpool NOP to maintain the original tensor.
    # This is also valid when the reshape ifm is produced by the CPU.

    # Check if the operator ifm is a subgraph ifm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)

    # Check if the ifm is produced by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    # Check the number of ifm consumers - if more than one, insert an avgpool NOP
    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1

    if ifm_is_sg_ifm or ifm_is_cpu_produced or ifm_has_multiple_cons:
        # The ifm needs to persist in order to remove the memory only operator.
        insert_copy_op_after_ifm(op)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )
        DebugDatabase.add_optimised(op, op)
    return op


def convert_to_lut(op, lut_values, lut_name):
    # Rewrite the operation as an Add with scalar 0 + LUT activation
    ifm = op.inputs[0]
    if ifm is None:
        return op
    assert ifm.dtype.size_in_bytes() == 1
    op.type = Op.Add
    op.name = op.name + "_lut_" + lut_name
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True
    # Create an input tensor containing scalar zero
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = ifm.quantization.scale_f32
    quantization.zero_point = 0
    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
    op.add_input_tensor(tens)
    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO no shape?

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as for the IFM
    op.forced_output_quantization = ifm.quantization
    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
    op.set_activation_lut(lut_tensor)
    op.set_ifm_ofm_shapes()
    return op