# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import check_quantized_tens_scaling_equal

memory_only_ops = (
    Op.Reshape,
    Op.Squeeze,
)


def _avoid_nhcwb16_for_concat(tens):
    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
    # multiple of 16. This is because only then will the address offset for the ofm, for all operations, be 16 byte
    # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            read_offset = cons_op.read_offsets[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            read_offset = cons_op.read_offsets[1]
        else:
            assert False
        if read_offset is not None and (read_offset[-1] % 16) != 0:
            return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


# Check if non linear format can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
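                # Recursively walk chains of Reshape consumers; yields True for any
                # consumer that is missing (None) or not run on the NPU.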
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
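                    # Recursively collect this Reshape and any chained Reshape consumers.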
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
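    # Example: input_size=224, stride=2, filter_size=3 gives out_size=112,
    # needed_input=225 and a total padding of 1.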
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding


# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_reshape_and_squeeze_ops(op):
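    # Bypass the memory-only op by connecting its ifm and ofm directly,
    # keeping whichever tensor has to persist (subgraph boundary or CPU produced/consumed).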
    assert op.type in (Op.Reshape, Op.Squeeze)
    ofm = op.ofm
    ifm = op.ifm
    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
    # Check if the ifm is produced by the CPU or the ofm is consumed by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    # This case should be handled prior to this function
    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

    if ofm_is_sg_ofm or ofm_is_cpu_consumed:
        # Bypassed by replacing ifm with ofm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

        # All ifm consumers need to use ofm as input
        for ifm_cons in ifm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                if cons_ifm == ifm:
                    ifm_cons.set_input_tensor(ofm, ifm_idx)
    else:
        # Bypassed by replacing ofm with ifm
        for cons in ofm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(cons.inputs):
                if cons_ifm == ofm:
                    cons.set_input_tensor(ifm, ifm_idx)


def move_splitsliceread_to_consumer(op, cons_op):
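    # Fold the SplitSliceRead's read offset/shape into cons_op and connect cons_op
    # directly to op.ifm, bypassing the intermediate tensor.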
    assert op.type == Op.SplitSliceRead

    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]

    if "skirt" in cons_op.attrs:
        assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
        cons_op.attrs["skirt"] = None
        cons_op.attrs["force_padding"] = True
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)


def check_reshapes(op, arch):
    if op.run_on_npu and op.type == Op.Reshape:
        ofm = op.ofm

        if check_quantized_tens_scaling_equal(op.ifm, ofm):
            # Reshape should have been removed
            raise VelaError(f"Reshape op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_after_tens(tens):
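    # Create a copy of tens via an AveragePool NOP and redirect all consumers of
    # tens to the copy, so that the original tensor can be preserved.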
    tens_cons_list_copy = tens.consumer_list.copy()

    # Create an avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    # Set copy_ifm consumers
    for tens_cons in tens_cons_list_copy:
        if tens_cons is not None:
            for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
                if cons_inp == tens:
                    tens_cons.set_input_tensor(copy_tens, ifm_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in (Op.Reshape, Op.Squeeze):
        return op

    # When the Reshape/Squeeze operators are removed, their tensors are removed as well.
    # But in order to do this, the tensors cannot be outputs of the sg;
    # this needs to be fixed prior to the removal.
    # The solution is to add an avgpool NOP to maintain the original tensor.
    # This is also valid when the reshape ifm/ofm is produced by or consumed by the CPU.

    # Check if operator ifm/ofm are sg ifm/ofm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
    # Check if the ifm is produced by the CPU or the ofm is consumed by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
        # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the Reshape/Squeeze
        insert_copy_op_after_tens(op.ifm)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )
        DebugDatabase.add_optimised(op, op)
    return op