# Copyright (C) 2021-2022 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Common functions and definitions used during the graph optimization.
from typing import Tuple

import numpy as np

from . import lut
from .architecture_features import Accelerator
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .errors import VelaError
from .operation import Op
from .operation_util import create_avgpool_nop
from .shape4d import Shape4D
from .tensor import create_const_tensor
from .tensor import QuantizationParameters

memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)


def _avoid_nhcwb16_for_concat(tens):
    # If the concat axis corresponds to the C-dimension, NHCWB16 can only be used for the output if all the
    # concat_start values are a multiple of 16, because only then is the address offset of the ofm 16 byte
    # aligned for every operation. For any other axis the address offsets are all based on c = 0, and those
    # addresses are always 16 byte aligned due to the NHCWB16 format.
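    # Illustrative example (hypothetical values): concatenating tensors of depth 24 and 40 along C gives
    # write offsets 0 and 24 in the ofm; 24 % 16 != 0, so NHCWB16 has to be avoided for the output.
    # With depths 32 and 48 the offsets are 0 and 32, both 16-aligned, so NHCWB16 remains allowed.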
    return any(op.write_offset.depth % 16 != 0 for op in tens.ops if op.write_offset is not None)


def _avoid_nhcwb16_for_split(tens):
    # If the read offset is not a multiple of 16 in the C-dimension, NHCWB16 needs to be avoided in the input
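    # Illustrative example (hypothetical values): splitting a depth-40 tensor into [16, 24] along C gives the
    # second consumer a read offset of 16 (16-aligned), whereas a [24, 16] split gives a read offset of 24,
    # which forces the input tensor to stay in linear (NHWC) format.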

    # Return True if NHCWB16 needs to be avoided
    def offset_not_aligned(read_offset):
        return read_offset is not None and (read_offset.depth % 16) != 0

    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            if offset_not_aligned(cons_op.read_offsets[0]):
                return True
        if cons_op.ifm2 is not None and cons_op.ifm2 == tens:
            if offset_not_aligned(cons_op.read_offsets[1]):
                return True
    return False


def _avoid_nhcwb16_for_shapes(tens):
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            assert False
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False


# Check if the non-linear format (NHCWB16) can be used
def check_format_restrictions(tens, arch):
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if _avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if _avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check that all producers/consumers are NHCWB16 compatible with tens.shape
    if _avoid_nhcwb16_for_shapes(tens):
        return

    # The resize bilinear half pixel center implementation requires an OFM with linear format to
    # allow stride modification in the H/W dimensions.
    for op in tens.ops:
        if op.original_type == Op.ResizeBilinear and op.type == Op.DepthwiseConv2DBias:
            return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False


def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)

    # The bottom/right padding might need downward adjustment depending on stride/input size
    total_minus_before = total_padding - pad_before
    output_pad_after = pad_after
    while output_pad_after > 0 and output_pad_after % stride != total_minus_before % stride:
        output_pad_after -= 1
    return pad_before, output_pad_after


def needed_total_padding(input_size, stride, filter_size):
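    # Returns the total padding needed so that ceil(input_size / stride) output elements can be produced
    # ("SAME"-style padding). Worked example (illustrative values): input_size=10, stride=2, filter_size=3
    # gives out_size=5, needed_input=(5 - 1) * 2 + 3 = 11, so total_padding = max(0, 11 - 10) = 1.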
    out_size = (input_size + stride - 1) // stride
    needed_input = (out_size - 1) * stride + filter_size
    total_padding = max(0, needed_input - input_size)
    return total_padding


# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch, nng):
    if op.type in memory_only_ops:
        eid = op.outputs[0].equivalence_id
        for inp in op.inputs:
            inp.equivalence_id = eid
    return op


def set_ifm_ofm_op_shapes(op, arch, nng):
    if op.run_on_npu and op.type.needs_shapes():
        if op.ifm_shapes or op.ofm_shapes:
            # Shapes already set
            return op
        op.set_ifm_ofm_shapes()
    return op


def bypass_memory_only_ops(op):
    assert op.type in memory_only_ops
    ofm = op.ofm
    ifm = op.ifm

    # Check if ifm is subgraph ifm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    # Check if ifm is produced by CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    # This case should be handled prior to this function
    assert not (ifm_is_sg_ifm or ifm_is_cpu_produced)

    # Bypass the op by replacing its ifm with its ofm: the producers of the ifm now write directly to the ofm
    ofm.ops = []
    for prev_op in ifm.ops:
        prev_op.outputs = [ofm]
        ofm.ops.append(prev_op)

    # All ifm consumers need to use ofm as input
    for ifm_cons in ifm.consumer_list:
        for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
            if cons_ifm == ifm:
                ifm_cons.set_input_tensor(ofm, ifm_idx)


def move_splitsliceread_to_consumer(op, cons_op):
    assert op.type == Op.SplitSliceRead

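    # Fold the SplitSliceRead into its consumer: copy the read offset/shape onto the consumer's ifm (or ifm2),
    # rewire the consumer to read the SplitSliceRead's ifm directly, and detach the SplitSliceRead from the graph.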
    if cons_op.ifm == op.ofm:
        cons_op.read_offsets[0] = op.read_offsets[0]
        cons_op.read_shapes[0] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
        cons_op.ifm_shapes[0] = op.ifm_shapes[0]
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        cons_op.read_offsets[1] = op.read_offsets[0]
        cons_op.read_shapes[1] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
        cons_op.ifm_shapes[1] = op.ifm_shapes[0]

    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)


def check_memory_only_removed(op, arch):
    if op.run_on_npu and op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")


def record_optimised(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_optimised(op, op)


def insert_copy_op_after_ifm(op):
    tens = op.ifm

    # Create an avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    op.set_input_tensor(copy_tens, 0)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)


def fix_sg_input_output(op, arch, nng):
    if not op.run_on_npu or op.type not in memory_only_ops:
        return op

    # For the memory only operators we want to remove, the ifm tensor is replaced by the ofm tensor.
    # But in order to do this, the ifm must not be an input to the subgraph and must not have more
    # than one consumer. This needs to be fixed prior to the removal.
    # The solution is to add an avgpool NOP to maintain the original tensor.
    # This is also valid when the reshape ifm is produced by the CPU.

    # Check if the operator ifm is a subgraph ifm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)

    # Check if the ifm is produced by the CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

    # Check the number of ifm consumers - if more than one, insert an avgpool NOP
    ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1

    if ifm_is_sg_ifm or ifm_is_cpu_produced or ifm_has_multiple_cons:
        # The ifm needs to persist in order to remove the memory only operator.
        insert_copy_op_after_ifm(op)

    return op


def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multiplier.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        else:
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},",
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}",
            )
        DebugDatabase.add_optimised(op, op)
    return op


def convert_to_lut(op, lut_values, lut_name):
    # Rewrite the operation as an Add with scalar 0 + LUT activation
    ifm = op.inputs[0]
    if ifm is None:
        return op
    assert ifm.dtype.size_in_bytes() == 1
    op.type = Op.Add
    op.name = op.name + "_lut_" + lut_name
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True
    # Create an input tensor containing scalar zero
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = ifm.quantization.scale_f32
    quantization.zero_point = 0
    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
    op.add_input_tensor(tens)
    op.ifm_shapes.append(Shape4D(tens.shape))  # TODO no shape?

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as for the IFM
    op.forced_output_quantization = ifm.quantization
    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
    op.set_activation_lut(lut_tensor)
    op.set_ifm_ofm_shapes()
    return op