# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Early optimisation of the TOSA based network graph, using the rewrite_graph module to do the traversal of the graph.
from . import rewrite_graph
from .api import NpuRoundingMode
from .data_type import DataType
from .debug_database import DebugDatabase
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Padding


def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
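    """Return the explicit padding in (top, left, bottom, right) order together with the corresponding skirt."""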
k_w, k_h = kernel.dilated_wh()
s_x, s_y = kernel.stride
ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
left_pad, right_pad, top_pad, bottom_pad = explicit_padding
padding = (top_pad, left_pad, bottom_pad, right_pad)
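    # The skirt keeps the explicit top/left padding and fills in bottom/right with
    # whatever remains of the total padding needed for this kernel size and stride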
skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt


def add_padding_fields(op, arch, nng):
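    """For NPU ops with a padding attribute, add explicit_padding and skirt attributes."""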
if op.run_on_npu:
if "padding" in op.attrs:
input_shape = op.ifm_shapes[0]
if op.type == Op.Conv2DBackpropInputSwitchedBias:
                # TODO not yet supported, separate handling will be needed
                assert False, "Op.Conv2DBackpropInputSwitchedBias not yet supported"
else:
padding, skirt = calc_padding_and_skirt(
Padding.EXPLICIT, op.kernel, input_shape, op.attrs.get("padding"),
)
op.attrs["explicit_padding"] = padding
op.attrs["skirt"] = skirt
    return op


def rewrite_activation(op, arch, nng):
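    """Check that a ReLU-like op can be fused with its producer and rewrite its integer clamp
    attributes (min_int/max_int) into zero-point adjusted min/max values."""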
if not op.type.is_relu_op():
return op
ifm = op.ifm
prev_op = ifm.ops[0]
    # Note: the checks on prev_op below require that a first optimisation pass over the full graph has already been performed
fuseable = (
prev_op.run_on_npu
and prev_op.type.npu_block_type != NpuBlockType.Default
and len(ifm.ops) == 1
and len(prev_op.outputs[0].consumers()) == 1
and prev_op.activation is None
)
if not fuseable:
print("Warning: relu like op will not be possible to fuse, currently not supported")
assert False
zp = ifm.quantization.zero_point if ifm.quantization.zero_point else 0
if op.ofm.quantization.zero_point is None:
op.ofm.quantization.zero_point = zp
if op.type == Op.Clip:
op.attrs["min"] = op.attrs["min_int"] - zp
op.attrs["max"] = op.attrs["max_int"] - zp
elif op.type == Op.ReluN:
op.attrs["max"] = op.attrs["max_int"] - zp
else:
print("Warning: Unknown TOSA activation Op")
assert False
    return op


def rewrite_rescale(op, arch, nng):
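    """Fuse a TOSA Rescale op into its producing op by moving the scaling, output zero point
    and rounding mode onto the producer and bypassing the Rescale."""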
if op.type == Op.Rescale:
ifm = op.ifm
ofm = op.ofm
# some error checking
assert len(ifm.ops) == 1
prev_op = ifm.ops[0]
# TODO currently not supported
assert prev_op.type not in (Op.Placeholder, Op.SubgraphInput, Op.Const)
assert len(ifm.consumer_list) == 1
input_zp = op.attrs["input_zp"]
output_zp = op.attrs["output_zp"]
multiplier = op.attrs["multiplier"]
shift = op.attrs["shift"]
scale32 = op.attrs["scale32"]
double_round = op.attrs["double_round"]
per_channel = op.attrs["per_channel"]
assert ifm.dtype in (DataType.uint8, DataType.int8, DataType.int32)
assert ifm.dtype in (DataType.uint8, DataType.int8) or input_zp == 0
assert ofm.dtype in (DataType.uint8, DataType.int8) or output_zp == 0
assert (scale32 and ifm.dtype != DataType.int48) or (not scale32 and not double_round)
# Check that input tensor has the same zp or no zp
ifm_zp = ifm.quantization.zero_point
if ifm_zp is not None and ifm_zp != input_zp:
print("Error (fuse_rescale): zp of tensors producer/consumer differs unexpectedidly ")
assert False
ifm.quantization.zero_point = input_zp
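        # Without 32-bit scaling, double rounding does not apply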
if not scale32:
double_round = False
if prev_op.type.is_depthwise_conv2d_op() or prev_op.type.is_conv2d_op() or prev_op.type == Op.FullyConnected:
assert len(multiplier) == len(shift) == len(prev_op.bias.values)
if ifm.dtype == DataType.int32 and per_channel:
for s, m in zip(shift, multiplier):
# TODO these are the TOSA limitations
assert m >= 0
assert 2 <= s <= 62
# TODO these are the HW limitations
assert 0 <= s < (1 << 6)
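                # Move the Rescale's scaling onto the producing op as explicit per-channel scaling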
prev_op.explicit_scaling = ExplicitScaling(per_channel, shift, multiplier)
ofm.quantization.zero_point = output_zp
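                # Map TOSA double rounding to the TFL rounding mode, otherwise use natural rounding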
if double_round:
prev_op.rounding_mode = NpuRoundingMode.TFL
else:
prev_op.rounding_mode = NpuRoundingMode.NATURAL
                # Bypass the Rescale op: the previous op now writes directly to the Rescale's output tensor
prev_op.set_output_tensor(ofm)
DebugDatabase.add_optimised(op, prev_op)
return op
else:
print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
assert False
else:
print("Warning, unsupported fusing of TOSA Rescale previous operator is of type:", prev_op.type)
assert False
    return op


def supported_operator_check(op, arch, nng):
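    """Mark the op to run on the NPU if it passes the TOSA supported-operators check."""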
op.run_on_npu = arch.tosa_supported_operators.is_operator_supported(op)
    return op


def tosa_optimise_graph(nng, arch):
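    """Run the TOSA graph optimisation passes: supported-operator check and shape setup,
    operator rewrites, then activation rewriting and padding calculation."""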
# Pre-processing step
pre_process_list = [
supported_operator_check,
set_ifm_ofm_op_shapes,
]
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
)
    # Rewrite operators step
op_rewrite_list = [set_tensor_equivalence, rewrite_rescale]
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
)
# Post-processing step
for idx, sg in enumerate(nng.subgraphs):
nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
nng, sg, arch, [], [rewrite_activation, add_padding_fields],
)
return nng