# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import check_quantized_tens_scaling_equal

memory_only_ops = (
    Op.Reshape,
    Op.Squeeze,
)


def _avoid_nhcwb16_for_concat(tens):
    # If the axis corresponds to the C-dimension, NHCWB16 can only be used in the output if all the concat_start
    # offsets are a multiple of 16, since only then will the OFM address offset of every producing operation be
    # 16 byte aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based
    # on c = 0 and those addresses are always 16 byte aligned due to the NHCWB16 format.
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)
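
# Worked example (hypothetical depths, not taken from this module): concatenating
# three tensors of depth 16, 24 and 8 along C gives write offsets c = 0, 16 and 40;
# 40 % 16 != 0, so such a concat OFM must stay in linear format.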


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            read_offset = cons_op.read_offsets[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            read_offset = cons_op.read_offsets[1]
        else:
            assert False
        if read_offset is not None and (read_offset[-1] % 16) != 0:
            return True
    return False
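
# E.g. (hypothetical offset) a Split consumer that reads its slice starting at
# channel offset 8 (8 % 16 != 0) forces the IFM tensor to stay in linear format.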


def _avoid_nhcwb16_for_shapes(tens):
    # Check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False
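
# E.g. a consumer whose ifm_shapes entry differs from the tensor's own 4D shape
# (a reshape folded into the op) reads the data through a different layout than
# the brick format would store it in, so linear format is kept.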


# Check if non-linear format can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.
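            # E.g. in a chain Reshape -> Reshape whose final OFM is consumed on the
            # CPU, incompatible_consumers() below yields a True entry and the
            # tensor keeps linear NHWC format.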

            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False

def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after
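
# Worked example (hypothetical values): input_size=8, stride=2, filter_size=3
# with explicit padding (1, 1). needed_total_padding() gives 1, so
# total_minus_before = 0; pad_after=1 has 1 % 2 != 0 % 2 and is stepped down
# to 0. The last padded column is never read by any output position, so the
# hardware padding (1, 0) is equivalent.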


def needed_total_padding(input_size, stride, filter_size):
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding
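
# E.g. input_size=224, stride=2, filter_size=3: out_size = 112 and
# needed_input = 111 * 2 + 3 = 225, so one padding element is needed in total.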


# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_reshape_and_squeeze_ops(op):
    assert op.type in (Op.Reshape, Op.Squeeze)
    ofm = op.ofm
    ifm = op.ifm
    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
    # Check if the ifm is produced, or the ofm consumed, by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    # This case should be handled prior to this function
    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

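    # Rewiring sketch for the Reshape case:
    #   before: prev_op -> ifm -> reshape -> ofm -> cons
    #   after:  prev_op -> ofm -> cons   (when the ofm must persist)
    #   or:     prev_op -> ifm -> cons   (otherwise, the ofm is dropped)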
    if ofm_is_sg_ofm or ofm_is_cpu_consumed:
        # Bypassed by replacing ifm with ofm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

        # All ifm consumers need to use ofm as input
        for ifm_cons in ifm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                if cons_ifm == ifm:
                    ifm_cons.set_input_tensor(ofm, ifm_idx)
    else:
        # Bypassed by replacing ofm with ifm
        for cons in ofm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(cons.inputs):
                if cons_ifm == ofm:
                    cons.set_input_tensor(ifm, ifm_idx)


def check_reshapes(op, arch):
    if op.run_on_npu and op.type == Op.Reshape:
        ofm = op.ofm

        if check_quantized_tens_scaling_equal(op.ifm, ofm):
            # Reshape should have been removed
            raise VelaError(f"Reshape op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_after_tens(tens):
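    # Rewires "tens -> consumers" into "tens -> avgpool NOP -> copy_tens -> consumers",
    # keeping the original tensor alive while giving its consumers a private copy.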
    tens_cons_list_copy = tens.consumer_list.copy()

    # Create an avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    # Set copy_ifm consumers
    for tens_cons in tens_cons_list_copy:
        if tens_cons is not None:
            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
                if cons_inp == tens:
                    tens_cons.set_input_tensor(copy_tens, ifm_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in (Op.Reshape, Op.Squeeze):
        return op

    # For the Reshape/Squeeze operators we want to remove, the tensors are removed as well.
    # But in order to do this, they cannot be outputs of the sg; this needs to be fixed
    # prior to the removal. The solution is to add an avgpool NOP to maintain the original
    # tensor. This is also valid when the reshape ifm/ofm is produced, respectively
    # consumed, by the CPU.
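    # E.g. a Squeeze whose ifm is the sg input and whose ofm is the sg output:
    # both tensors must persist, so the ifm is first duplicated via
    # insert_copy_op_after_tens() before the Squeeze itself can be removed.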

    # Check if operator ifm/ofm are sg ifm/ofm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
    # Check if the ifm is produced, or the ofm consumed, by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
        # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the Reshape/Squeeze
        insert_copy_op_after_tens(op.ifm)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)
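    # E.g. (hypothetical shapes): a depthwise kernel stored as (H, W, 1, M)
    # becomes a conv kernel of shape (H, W, M, 1) under the (0, 1, 3, 2)
    # transpose applied below.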

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},"
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}"
            )
        DebugDatabase.add_optimised(op, op)
    return op