blob: dafd28498e5400a2fbbe28953da5194935ce901e [file] [log] [blame]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16# Description:
17# Common functions and definitions used during the graph optimization.
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020018from typing import Tuple
19
Patrik Gustavssondf995102021-08-23 15:33:59 +020020import numpy as np
21
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020022from .data_type import DataType
23from .debug_database import DebugDatabase
Patrik Gustavssondf995102021-08-23 15:33:59 +020024from .errors import UnsupportedFeatureError
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020025from .errors import VelaError
26from .operation import Op
Patrik Gustavssondf995102021-08-23 15:33:59 +020027from .operation_util import create_avgpool_nop
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020028from .shape4d import Shape4D
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020029
# Operator types that only rearrange tensor shape metadata (no data movement
# or computation); the optimiser tries to bypass/remove these so that their
# input and output tensors can share storage (see set_tensor_equivalence and
# bypass_memory_only_ops below).
memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020036
37
38def _avoid_nhcwb16_for_concat(tens):
39 # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
40 # multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
41 # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
42 # and those addresses are always 16 byte aligned due to the NHCWB16 format.
43 return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)
44
45
46def _avoid_nhcwb16_for_split(tens):
47 # If read offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input
48 for cons_op in tens.consumer_list:
49 if cons_op.ifm == tens:
50 read_offset = cons_op.read_offsets[0]
51 elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
52 read_offset = cons_op.read_offsets[1]
53 else:
54 assert False
55 if read_offset is not None and (read_offset[-1] % 16) != 0:
56 return True
57 return False
58
59
def _avoid_nhcwb16_for_shapes(tens):
    # NHCWB16 is ruled out if any producer or consumer operates on a 4D shape
    # that differs from the tensor's own shape.
    tens_shape = Shape4D(tens.shape)

    for consumer in tens.consumer_list:
        if consumer.ifm == tens:
            op_shape = consumer.ifm_shapes[0]
        else:
            # Second input of a binary elementwise op is the only other
            # legitimate way this tensor can be consumed.
            assert consumer.type.is_binary_elementwise_op() and consumer.ifm2 == tens
            op_shape = consumer.ifm_shapes[1]
        if tens_shape != op_shape:
            return True

    return any(tens_shape != producer.ofm_shapes[0] for producer in tens.ops)
77
78
# Check if non linear format can be used
def check_format_restrictions(tens, arch):
    """Decide whether tens may use the non-linear NHCWB16 format.

    Clears tens.needs_linear_format only when every restriction below passes;
    any early return leaves the tensor in linear format.
    """
    # A tensor with no producers cannot be format-checked
    if len(tens.ops) < 1:
        return
    # Graph inputs/constants and tensors with a null consumer (subgraph
    # output terminator) must stay in linear format
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception: see _avoid_nhcwb16_for_concat
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception: see _avoid_nhcwb16_for_split
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    for op in tens.consumer_list:
        # ReduceSum with int32 input requires linear format
        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            # Recursively yields True for any op in the reshape chain that is
            # missing (None terminator) or not run on the NPU
            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                # Recursively yields every op in the chain of reshapes
                # starting at oper (including oper itself, last)
                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                    yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                # Empty list (no rewrites found) also forces linear format
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    # All checks passed: the tensor may use NHCWB16
    tens.needs_linear_format = False
139
140
def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """Translate the explicit padding of a PAD operation into hardware padding.

    Returns a (pad_before, pad_after) pair giving equivalent results; the
    before-padding is kept as-is, while the after-padding may be reduced
    depending on stride and input size.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The after-padding must line up (modulo stride) with the padding that
    # remains once pad_before is accounted for; step it down until it does.
    target_mod = (total_padding - pad_before) % stride
    adjusted_after = pad_after
    while adjusted_after > 0 and adjusted_after % stride != target_mod:
        adjusted_after -= 1
    return pad_before, adjusted_after
154
155
def needed_total_padding(input_size, stride, filter_size):
    """Return the total (before + after) padding required so that a filter of
    filter_size applied with the given stride covers all of input_size."""
    num_outputs = -(-input_size // stride)  # ceil(input_size / stride)
    required_input = (num_outputs - 1) * stride + filter_size
    return max(0, required_input - input_size)
161
162
# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    """Give the inputs of a memory-only op the same equivalence id as its
    output, so the tensors can later share storage. Returns op unchanged."""
    if op.type in memory_only_ops:
        shared_id = op.outputs[0].equivalence_id
        for tensor in op.inputs:
            tensor.equivalence_id = shared_id
    return op
170
171
172def set_ifm_ofm_op_shapes(op, arch, nng):
173 if op.run_on_npu and op.type.needs_shapes():
174 if op.ifm_shapes or op.ofm_shapes:
175 # Shapes already set
176 return op
177 op.set_ifm_ofm_shapes()
178 return op
179
180
def bypass_memory_only_ops(op):
    """Remove a memory-only op by merging its ifm and ofm into one tensor.

    If the ofm must survive (subgraph output or consumed by CPU), the ifm's
    producers are rewired to write the ofm directly; otherwise the ofm's
    consumers are rewired to read the ifm directly. The case where both
    tensors must survive is handled earlier (see fix_sg_input_output).
    """
    assert op.type in memory_only_ops
    ofm = op.ofm
    ifm = op.ifm

    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
    # Check if ifm/ofm is produced respectively consumed by CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    # This case should be handled prior to this function
    assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

    if ofm_is_sg_ofm or ofm_is_cpu_consumed:
        # Bypassed by replacing ifm with ofm: ifm's producers now write ofm
        ofm.ops = []
        for prev_op in ifm.ops:
            prev_op.outputs = [ofm]
            ofm.ops.append(prev_op)

        # All ifm consumers need to use ofm as input
        for ifm_cons in ifm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                if cons_ifm == ifm:
                    ifm_cons.set_input_tensor(ofm, ifm_idx)
    else:
        # Bypassed by replacing ofm with ifm: ofm's consumers now read ifm
        for cons in ofm.consumer_list:
            for ifm_idx, cons_ifm in enumerate(cons.inputs):
                if cons_ifm == ofm:
                    cons.set_input_tensor(ifm, ifm_idx)
215
216
def move_splitsliceread_to_consumer(op, cons_op):
    """Fold a SplitSliceRead op into its consumer cons_op.

    The consumer is rewired to read op.ifm directly, taking over the read
    offset/shape from the SplitSliceRead; the intermediate op.ofm tensor is
    then disconnected from the graph.
    """
    assert op.type == Op.SplitSliceRead

    if cons_op.ifm == op.ofm:
        # Consumer reads the sliced tensor as its primary ifm
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        # Consumer reads the sliced tensor as its second elementwise input
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]

    if "skirt" in cons_op.attrs:
        # With the read now offset into the full ifm, the skirt no longer
        # applies; force explicit padding instead
        assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
        cons_op.attrs["skirt"] = None
        cons_op.attrs["force_padding"] = True
    # Detach the intermediate ofm and the SplitSliceRead op from the graph
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)
238
239
def check_memory_only_removed(op, arch):
    """Sanity check: raise VelaError if a memory-only op scheduled for the
    NPU is still present in the graph at this point."""
    if op.type in memory_only_ops and op.run_on_npu:
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200244
245
def record_optimised(op, arch):
    """Record every non-constant op in the debug database, mapping it to
    itself as its own optimised form."""
    if op.type == Op.Const:
        return
    DebugDatabase.add_optimised(op, op)
Patrik Gustavssondf995102021-08-23 15:33:59 +0200249
250
def insert_copy_op_after_tens(tens):
    """Insert an AvgPool no-op after tens.

    A clone of tens becomes the tensor that every previous consumer reads,
    while tens itself is preserved as the no-op's input.
    """
    previous_consumers = tens.consumer_list.copy()

    # Create an avg_pool nop with tens as input and a clone as output
    cloned_tens = tens.clone()
    nop = create_avgpool_nop(tens.name + "_avgpool")
    nop.add_input_tensor(tens)
    nop.set_output_tensor(cloned_tens)
    nop.set_ifm_ofm_shapes()
    nop.run_on_npu = True

    # Rewire every original consumer input slot from tens to the clone
    for consumer in previous_consumers:
        if consumer is None:
            continue
        for input_idx, consumer_input in enumerate(consumer.inputs):
            if consumer_input == tens:
                consumer.set_input_tensor(cloned_tens, input_idx)

    DebugDatabase.add_optimised(tens.ops[0], nop)
270
271
def fix_sg_input_output(op, arch, nng):
    """Ensure persistent tensors survive removal of a memory-only op.

    Memory-only operators are removed by merging their ifm/ofm tensors, which
    is impossible when both tensors must persist - i.e. when both are subgraph
    inputs/outputs or are produced/consumed by CPU ops. In that case an
    avgpool NOP is inserted after the ifm so the original tensor is kept.
    Returns op unchanged.
    """
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    ifm, ofm = op.ifm, op.ofm

    # ifm must persist if it is a subgraph input, a subgraph output
    # (null consumer terminator), or is produced by a CPU op
    ifm_must_persist = (
        ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
        or any(cons is None for cons in ifm.consumer_list)
        or any(prod is not None and not prod.run_on_npu for prod in ifm.ops)
    )
    # ofm must persist if it is a subgraph output or is consumed by a CPU op
    ofm_must_persist = any(cons is None for cons in ofm.consumer_list) or any(
        cons is not None and not cons.run_on_npu for cons in ofm.consumer_list
    )

    if ifm_must_persist and ofm_must_persist:
        # Both tensors need to persist, but only the ifm needs a copy for the
        # memory-only operator to be removable later
        insert_copy_op_after_tens(ifm)

    return op
296
297
def convert_depthwise_to_conv(op, arch, nng):
    """Rewrite an eligible depthwise convolution as a regular convolution.

    A depthwise conv with ifm depth 1 whose ofm depth equals its depth
    multiplier computes the same result as a plain Conv2D, so the operator
    type is switched and the weights are transposed to match. Raises
    UnsupportedFeatureError for any other depthwise with a depth multiplier
    other than 1. Returns op (possibly modified).
    """
    if op.type == Op.DepthwiseConv2DBias and op.attrs["depth_multiplier"] != 1:
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]

        convertible = ifm_shape.depth == 1 and ofm_shape.depth == op.attrs["depth_multiplier"]
        if not convertible:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )

        # Change op type to Conv2d and drop the multiplier attributes
        op.type = Op.Conv2DBias
        del op.attrs["channel_multiplier"]
        del op.attrs["depth_multiplier"]

        # Swap the last two weight axes to match the Conv2D weight ordering
        weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
        weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        DebugDatabase.add_optimised(op, op)
    return op