# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Early optimisation of the TOSA-based network graph, using the rewrite_graph module to traverse the graph.
from . import rewrite_graph
from .api import NpuRoundingMode
from .data_type import DataType
from .debug_database import DebugDatabase
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation_util import create_avgpool_nop


def replace_rescale_with_avg_pool(rescale_op):
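    """Convert a Rescale op in place into an AvgPool NOP that carries the rescaling."""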
    assert rescale_op.type == Op.Rescale
    avgpool_op = create_avgpool_nop(rescale_op.name + "_avgpool")
    rescale_op_clone = rescale_op.clone()
    op = rescale_op
    op.attrs = avgpool_op.attrs.copy()
    op.type = Op.AvgPool
    DebugDatabase.add_optimised(rescale_op_clone, op)
    return op

def calc_skirt(kernel, input_shape, explicit_padding):
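    """Calculate the explicit padding and the skirt for a kernel applied over the given input shape."""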
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))

    top, left, bottom, right = explicit_padding
    top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
    left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt

def add_padding_fields(op, arch, nng):
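    """Add explicit_padding and skirt attributes to ops that run on the NPU and carry explicit padding."""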
    if op.run_on_npu:
        if "explicit_padding" in op.attrs:
            input_shape = op.ifm_shapes[0]

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                # TODO: not yet supported; will need separate handling
                assert False
            else:
                padding, skirt = calc_skirt(op.kernel, input_shape, op.attrs.get("explicit_padding"))

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op

def rewrite_activation(op, arch, nng):
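    """Adjust the min/max attributes of a ReluN or Clamp op by the IFM zero point, checking that it can be fused with its producer."""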
    if op.type not in (Op.ReluN, Op.Clamp):
        return op

    ifm = op.ifm
    prev_op = ifm.ops[0]

    # Note: the checks on prev_op below require that a first optimisation pass over the full graph has been performed
    fuseable = (
        prev_op.run_on_npu
        and prev_op.type.npu_block_type != NpuBlockType.Default
        and len(ifm.ops) == 1
        and len(prev_op.outputs[0].consumers()) == 1
        and prev_op.activation is None
    )
    if not fuseable:
        print("Warning: ReLU-like op cannot be fused, which is currently not supported")
        assert False

    zp = ifm.quantization.zero_point if ifm.quantization.zero_point else 0
    if op.ofm.quantization.zero_point is None:
        op.ofm.quantization.zero_point = zp

    if op.type == Op.Clamp:
        op.attrs["min"] = op.attrs["min_int"] - zp
        op.attrs["max"] = op.attrs["max_int"] - zp
    elif op.type == Op.ReluN:
        op.attrs["max"] = op.attrs["max_int"] - zp
    else:
        print("Warning: unknown TOSA activation op")
        assert False

    return op

def rewrite_rescale(op, arch, nng):
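    """Fuse a TOSA Rescale op into the preceding convolution-like op when possible, otherwise convert it to an AvgPool NOP carrying the rescaling."""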
    if op.type == Op.Rescale:
        ifm = op.ifm
        ofm = op.ofm

        # some error checking
        assert len(ifm.ops) == 1
        prev_op = ifm.ops[0]

        # TODO: currently not supported
        assert len(ifm.consumer_list) == 1

        input_zp = op.attrs["input_zp"]
        output_zp = op.attrs["output_zp"]
        multiplier = op.attrs["multiplier"]
        shift = op.attrs["shift"]
        scale32 = op.attrs["scale32"]
        double_round = op.attrs["double_round"]
        per_channel = op.attrs["per_channel"]

        assert ifm.dtype in (DataType.uint8, DataType.int8, DataType.int32)
        assert ifm.dtype in (DataType.uint8, DataType.int8) or input_zp == 0
        assert ofm.dtype in (DataType.uint8, DataType.int8) or output_zp == 0
        assert (scale32 and ifm.dtype != DataType.int48) or (not scale32 and not double_round)

        # Check that the input tensor has the same zero point, or no zero point
        ifm_zp = ifm.quantization.zero_point
        if ifm_zp is not None and ifm_zp != input_zp:
            print("Error (fuse_rescale): zero point of producer/consumer tensors differs unexpectedly")
            assert False
        ifm.quantization.zero_point = input_zp
        ofm.quantization.zero_point = output_zp

        for s, m in zip(shift, multiplier):
            # TODO: these are the TOSA limitations
            assert m >= 0
            assert 2 <= s <= 62
            # TODO: these are the HW limitations
            assert 0 <= s < (1 << 6)
        explicit_scaling = ExplicitScaling(per_channel, shift, multiplier)

        if double_round and scale32:
            rounding_mode = NpuRoundingMode.TFL
        else:
            rounding_mode = NpuRoundingMode.NATURAL

        if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected:
            assert len(multiplier) == len(shift) == len(prev_op.bias.values)

            if ifm.dtype == DataType.int32 and per_channel:
                prev_op.explicit_scaling = explicit_scaling
                prev_op.rounding_mode = rounding_mode

                # Bypass op
                prev_op.set_output_tensor(ofm)
                DebugDatabase.add_optimised(op, prev_op)
                return op
            else:
                print("Warning: unsupported fusing of TOSA Rescale; previous operator is of type:", prev_op.type)
                assert False
        # TODO: which are the cases where we need to, and can, do a standalone Rescale?
        # TODO: should we try to identify a uint8<->int8 conversion accomplished by 2 RESCALE ops?
        # The origin might be the TFLite op QUANTIZE; should we look to see if they can be translated to QUANTIZE?
        # Limited to these cases at the moment:
        elif (
            (ifm.dtype == DataType.int8 and ofm.dtype == DataType.int8)
            or (ifm.dtype == DataType.uint8 and ofm.dtype == DataType.int8)
            or (ifm.dtype == DataType.int8 and ofm.dtype == DataType.uint8)
        ):
            # Create a NOP performing the RESCALE
            avgpool_op = replace_rescale_with_avg_pool(op)
            avgpool_op.rounding_mode = rounding_mode

            if per_channel:
                # TODO
                avgpool_op.explicit_scaling = explicit_scaling
                print("Warning: unsupported per-channel TOSA Rescale")
                assert False
            else:
                avgpool_op.explicit_scaling = explicit_scaling
        else:
            print("Warning: unsupported fusing of TOSA Rescale; previous operator is of type:", prev_op.type)
            assert False

    return op

def fixup_quantization(op, arch, nng):
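    """Default any missing zero points on the op's ifm/ifm2/ofm quantization to 0."""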
    if op.ifm and op.ifm.quantization.zero_point is None:
        op.ifm.quantization.zero_point = 0
    if op.ifm2 and op.ifm2.quantization.zero_point is None:
        op.ifm2.quantization.zero_point = 0
    if op.ofm and op.ofm.quantization.zero_point is None:
        op.ofm.quantization.zero_point = 0
    return op

def supported_operator_check(op, arch, nng):
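    """Set op.run_on_npu based on the TOSA supported operators check."""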
    op.run_on_npu = arch.tosa_supported_operators.is_operator_supported(op)
    return op

def tosa_optimise_graph(nng, arch):
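    """Apply the TOSA graph optimisation steps to all subgraphs in the network."""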
    # Pre-processing step
    pre_process_list = [
        supported_operator_check,
        set_ifm_ofm_op_shapes,
    ]

    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
        )

    # Rewrite Operators step
    op_rewrite_list = [set_tensor_equivalence, rewrite_rescale]

    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
        )

    # Post-processing step 1
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], [rewrite_activation, add_padding_fields],
        )

    # Post-processing step 2
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(nng, sg, arch, [], [fixup_quantization],)

    return nng