1# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16# Description:
17# Early optimisation of a TensorFlow Lite-based network graph, using the rewrite_graph module
18# to traverse the graph.
19import math
20import uuid
21
22import numpy as np
23
24from . import fp_math
25from . import lut
26from . import rewrite_graph
27from . import scaling
28from .api import NpuRoundingMode
29from .data_type import DataType
30from .debug_database import DebugDatabase
31from .errors import UnsupportedFeatureError
32from .ethos_u55_regs.ethos_u55_regs import resampling_mode
33from .graph_optimiser_util import bypass_reshape_and_squeeze_ops
34from .graph_optimiser_util import calc_explicit_padding
35from .graph_optimiser_util import convert_depthwise_to_conv
36from .graph_optimiser_util import fix_sg_input_output
37from .graph_optimiser_util import needed_total_padding
38from .graph_optimiser_util import set_ifm_ofm_op_shapes
39from .graph_optimiser_util import set_tensor_equivalence
40from .numeric_util import clamp_sigmoid
41from .numeric_util import full_shape
42from .numeric_util import round_away_zero
43from .operation import create_activation_function
44from .operation import NpuBlockType
45from .operation import Op
46from .operation import Operation
47from .operation import Padding
48from .operation_util import create_avgpool_nop
49from .operation_util import get_pad_values_from_input
50from .shape4d import Shape4D
51from .softmax import SoftMax
52from .tensor import check_quantized_tens_scaling_equal
53from .tensor import create_const_tensor
54from .tensor import create_equivalence_id
55from .tensor import QuantizationParameters
56from .tensor import Tensor
57from .tensor import TensorPurpose
58from .tflite_mapping import optype_to_builtintype
59
60passthrough_nodes = (Op.Identity,)
61
62
63def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
64 """Creates an average pool for the given concat op/input feature map"""
65 ofm = concat_op.ofm
66 avgpool_op = create_avgpool_nop(name)
67 avgpool_op.inputs = [ifm]
68 avgpool_op.outputs = [ofm]
69
70 avgpool_op.write_offset = write_offset
71 avgpool_op.write_shape = ifm_shape
72 ofm.ops.append(avgpool_op)
73 DebugDatabase.add_optimised(concat_op, avgpool_op)
74 avgpool_op.ifm_shapes.append(ifm_shape)
75 avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
76 avgpool_op.memory_function = Op.ConcatSliceWrite
77 return avgpool_op
78
79
80def remove_passthrough_tensor(tens, arch, nng):
81 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
82 assert len(tens.ops[0].inputs) == 1
83 tens = tens.ops[0].inputs[0]
84 return tens
85
86
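# Concat ops are implemented by giving each input its own average pool NOP (see
# create_avg_pool_for_concat above) that copies the input into the correct slice of the OFM;
# the write offset advances along the concat axis as each input is processed.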
87def rewrite_concat_ops(op, arch):
88 if not op.run_on_npu or not op.type.is_concat_op():
89 return
90
91 axis_4D = 0
92 ofm = op.ofm
93 ofm.ops = []
94 offset = 0
95
96 unfuse_activation_function(op)
97
98 if op.type == Op.Pack:
99 # Pack is also referred to as Stack
100 axis = int(op.attrs["axis"])
101 if axis < 0: # Convert to positive axis
102 axis = len(op.inputs[0].shape) + 1 + axis
103
104 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
105
106 axis_4D = axis + (4 - len(desired_shape))
107
108 for idx, inp in enumerate(op.inputs):
109 op.ifm_shapes[idx] = Shape4D(desired_shape)
110 op.type = Op.PackReshaped
111
112 inputs, axis = op.get_concat_inputs_axis()
113 for idx, inp in enumerate(inputs):
114 if op.type != Op.PackReshaped:
115 op.ifm_shapes[idx] = Shape4D(inp.shape)
116 if axis >= 0:
117 axis_4D = axis + (4 - len(inp.shape))
118 else:
119 axis_4D = axis
120 write_offset = [0, 0, 0, 0]
121 write_offset[axis_4D] = offset
122 concat_end = offset + op.ifm_shapes[idx][axis_4D]
123 create_avg_pool_for_concat(
124 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
125 )
126 offset = concat_end
127 assert ofm.shape[axis] == offset
128
129 return op
130
131
132def rewrite_split_ops(tens, arch, nng):
133
134 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
135 split_op = tens.ops[0]
136
137 # Not supported so leave it and run on CPU
138 if not split_op.run_on_npu:
139 return tens
140
141 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
142
143 tens.ops = []
144 new_op = Operation(Op.SplitSliceRead, split_op.name)
145 new_op.inputs = [inp]
146 ofm_shape_idx = 0
147 read_shape = offset_end
148
149 # For Split the offset cannot be extracted from the tensor so it has to
150 # be calculated from the index of the output tensor
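        # For example (illustrative): splitting a [1, 8, 8, 16] tensor into four outputs along
        # the depth axis gives each output a depth of 4, so the output with index 2 is read
        # with offset [0, 0, 0, 8].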
151 if axis is not None:
152 # Get the start and end of the split
153 offset_start = [0] * 4
154 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
155 for idx, out in enumerate(outputs):
156 if axis_4D_list is not None:
157 axis_4D = axis_4D_list[idx]
158 else:
159 split_op.ofm_shapes[idx] = Shape4D(out.shape)
160 if axis >= 0:
161 axis_4D = axis + (4 - len(out.shape))
162 else:
163 axis_4D = axis
164
165 if out == tens:
166 ofm_shape_idx = idx
167 read_shape = split_op.ofm_shapes[idx]
168 break
169
170 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
171
172 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
173 new_op.read_shapes[0] = read_shape
174 new_op.run_on_npu = True
175 new_op.set_output_tensor(tens)
176 new_op.ifm_shapes.append(Shape4D(inp.shape))
177 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
178 DebugDatabase.add_optimised(split_op, new_op)
179
180 return tens
181
182
183def remove_SplitSliceRead(op, arch):
184
185 if op.type == Op.SplitSliceRead:
186 # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
187 if (
188 len(op.ofm.consumer_list) == 1
189 and op.ofm.consumer_list[0] is not None
190 and op.ofm.consumer_list[0].run_on_npu
191 and op.ofm.consumer_list[0].type not in (Op.Reshape, Op.Squeeze)
192 and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
193 ):
194 # SplitSliceRead can be performed by tensor consumer
195 cons_op = op.ofm.consumer_list[0]
196 if cons_op.ifm == op.ofm:
197 cons_op.read_offsets[0] = op.read_offsets[0]
198 cons_op.read_shapes[0] = op.read_shapes[0]
199 cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
200 cons_op.ifm_shapes[0] = op.ifm_shapes[0]
201 elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
202 cons_op.read_offsets[1] = op.read_offsets[0]
203 cons_op.read_shapes[1] = op.read_shapes[0]
204 cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
205 cons_op.ifm_shapes[1] = op.ifm_shapes[0]
206
207 if "skirt" in cons_op.attrs:
208 assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
209 cons_op.attrs["skirt"] = None
210 cons_op.attrs["force_padding"] = True
211 op.ofm.consumer_list.remove(cons_op)
212 op.ofm.ops = []
213 op.ifm.consumer_list.remove(op)
214 else:
215 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
216 avgpool_op.add_input_tensor(op.ifm)
217 avgpool_op.outputs = [op.ofm]
218 op.ofm.ops.remove(op)
219 op.ofm.ops.append(avgpool_op)
220 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
221 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
222 avgpool_op.read_offsets[0] = op.read_offsets[0]
223 avgpool_op.read_shapes[0] = op.read_shapes[0]
224
225 op.ifm.consumer_list.remove(op)
226 DebugDatabase.add_optimised(op, avgpool_op)
227
228
229def insert_copy_op_after_tens(tens):
230 tens_cons_list_copy = tens.consumer_list.copy()
231
232 # Create an avg_pool nop op with ifm as input
233 copy_tens = tens.clone()
234 copy_op = create_avgpool_nop(tens.name + "_avgpool")
235 copy_op.add_input_tensor(tens)
236 copy_op.set_output_tensor(copy_tens)
237 copy_op.set_ifm_ofm_shapes()
238 copy_op.run_on_npu = True
239
240 # Set copy_ifm consumers
241 for tens_cons in tens_cons_list_copy:
242 if tens_cons is not None:
243 for ifm_idx, cons_inp in enumerate(tens_cons.inputs):
244 if cons_inp == tens:
245 tens_cons.set_input_tensor(copy_tens, ifm_idx)
246
247 DebugDatabase.add_optimised(tens.ops[0], copy_op)
248
249
250def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
251 k_w, k_h = kernel.dilated_wh()
252 s_x, s_y = kernel.stride
253 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
254 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
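    # Worked example (illustrative, assuming needed_total_padding() returns the standard
    # TensorFlow SAME-padding total): input width 224, stride 2 and dilated kernel width 3
    # give xpad = 1, so SAME padding below becomes left_pad = 0, right_pad = 1.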
255 if padding_type == Padding.SAME:
256 left_pad = (xpad + 0) // 2
257 right_pad = (xpad + 1) // 2
258 top_pad = (ypad + 0) // 2
259 bottom_pad = (ypad + 1) // 2
260 elif padding_type == Padding.VALID:
261 left_pad = 0
262 right_pad = 0
263 top_pad = 0
264 bottom_pad = 0
265 elif padding_type == Padding.EXPLICIT:
266 # Padding is specified in a PAD operator which has been bypassed.
267 top, left, bottom, right = explicit_padding
268 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
269 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
270 else:
271 raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
272 padding = (top_pad, left_pad, bottom_pad, right_pad)
273 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
274 return padding, skirt
275
276
277def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
278 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
279 if padding_type == Padding.SAME:
280 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
281 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
282 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
283 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
284 left_pad = max(kernel_width - 1 - right_pad, 0)
285 top_pad = max(kernel_height - 1 - bottom_pad, 0)
286 elif padding_type == Padding.VALID:
287 right_pad = max(kernel_width - 2, 0)
288 bottom_pad = max(kernel_height - 2, 0)
289 left_pad = kernel_width - 1
290 top_pad = kernel_height - 1
291 else:
292 raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
293 padding = (top_pad, left_pad, bottom_pad, right_pad)
294 skirt = padding
295 return padding, skirt
296
297
298def fixup_conv2d_backprop(op, arch, nng):
299 if op.type == Op.Conv2DBackpropInput:
300 # flip the inputs
301 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
302 op.type = Op.Conv2DBackpropInputSwitchedBias
303 op.ifm.resampling_mode = resampling_mode.TRANSPOSE
304
305 # Update strides
306 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
307
308 return op
309
310
311# Convert the op to an elementwise add
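# A bilinear resize of a 1x1 IFM just broadcasts the single input value over the output, so the
# op can be replaced by an Add of the IFM and a zero-filled constant tensor of the output shape.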
312def convert_resizebilinear_1x1_to_add(op):
313 op.type = Op.Add
314 op.name = op.name + "_add"
315 op.attrs["resizebilinear"] = True
316 # Create an input tensor filled with zeros
317 shape = op.ofm_shapes[0].as_list()
318 tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
319 tens.values = np.zeros(shape, tens.dtype.as_numpy_type())
320 tens.quantization = QuantizationParameters(0.0, 255.0)
321 tens.quantization.scale_f32 = 1.0
322 tens.quantization.zero_point = 0
323 tens.consumer_list = [op]
324 tens_op = op.inputs[1].ops[0]
325 tens_op.set_output_tensor(tens)
326 # Set the add inputs
327 op.inputs[1] = op.inputs[0]
328 op.inputs[0] = tens
329 op.set_ifm_ofm_shapes()
330
331 return op
332
333
334# Convert ResizeBilinear to a number of 2x2 pool ops
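# Each iteration of the loop below doubles the height and width (minus one when align_corners
# is set), using 2x2 average pooling with int16 intermediate tensors, until the OFM resolution
# is reached.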
335def convert_resizebilinear_to_2x2_pool(op):
336 count = 0
337 pre_op = op
338 outputs = op.outputs
339
340 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
341 if op.attrs["align_corners"]:
342 shape_modifier = 1
343 op.attrs["padding"] = Padding.VALID
344 else:
345 shape_modifier = 0
346 op.attrs["padding"] = Padding.SAME
347 op.inputs[0].resampling_mode = resampling_mode.NEAREST
348
349 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
350 out_shape = np.array(op.ofm_shapes[0].get_hw_as_list())
351 if (upscaled_shape == upscaled_shape * 2 - shape_modifier).all():
352 return op
353
354 while (upscaled_shape < out_shape).all():
355 if count == 0:
356 scaled_op = pre_op
357 else:
358 scaled_op = op.clone("_{}".format(count))
359 scaled_op.inputs[0] = pre_op.outputs[0]
360
361 upscaled_shape = upscaled_shape * 2 - shape_modifier
362
363 if (upscaled_shape == out_shape).all():
364 scaled_op.outputs = outputs
365 scaled_op.outputs[0].ops = [scaled_op]
366 else:
367 shape = op.ofm_shapes[0].as_list()
368 shape[1:3] = upscaled_shape
369 out_tens = Tensor(shape, DataType.int16, "{}_{}".format(op.outputs[0].name, count))
370 out_tens.quantization = op.outputs[0].quantization.clone()
371 out_tens.quantization.quant_min = np.iinfo(np.int16).min
372 out_tens.quantization.quant_max = np.iinfo(np.int16).max
373 scaled_op.set_output_tensor(out_tens)
374 pre_op = scaled_op
375 count += 1
376
377 # Setup the scale value
378 if scaled_op.inputs[0].dtype.bits == 8 and scaled_op.outputs[0].dtype.bits == 16:
379 scaled_op.rescale = 128
380 elif scaled_op.inputs[0].dtype.bits == 16 and scaled_op.outputs[0].dtype.bits == 8:
381 scaled_op.rescale = 1 / 128
382 else:
383 scaled_op.rescale = None
384 scaled_op.set_ifm_ofm_shapes()
385
386 return op
387
388
389def fixup_resizebilinear(op, arch, nng):
390 if op.type == Op.ResizeBilinear and op.run_on_npu:
391 if op.ifm_shapes[0] == op.ofm_shapes[0]:
392 # Bypass nop resizebilinear
393 op.inputs = op.inputs[:1]
394 op.type = Op.Identity
395 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
396 convert_resizebilinear_1x1_to_add(op)
397 else:
398 convert_resizebilinear_to_2x2_pool(op)
399
400 return op
401
402
403def convert_nop_split_to_identity(op, arch, nng):
404 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
405 # the list comprehension should return a list with a single tensor
406 # if it does not, remove_passthrough_tensor will fail appropriately
407 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
408 op.type = Op.Identity
409 return op
410
411
412def rewrite_fully_connected_input(op, arch, nng):
413 if op.type == Op.FullyConnected:
414 n_in_elems = op.weights.shape[-2]
415 elms = op.ifm.elements()
416 batch_size = elms // n_in_elems
417 assert batch_size * n_in_elems == elms
418
419 op.ifm_shapes[0] = Shape4D([batch_size, 1, 1, n_in_elems])
420 return op
421
422
423def convert_batched_fc_shape(op, arch, nng):
424 if op.type == Op.FullyConnected:
425 # Check if the first dimension indicates batching
426 if op.ifm_shapes[0].batch > 1:
427 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
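            # For example, a batch of 8 is mapped to a 1x2x4xC IFM shape; batch sizes not in the
            # table fall back to 1x1xNxC.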
428 n = op.ifm_shapes[0].batch
429 h, w = batching_split.get(n, (1, n))
430 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
431
432 # Reshape Weights to be 4D. IO becomes HWIO
433 weight_tensor = op.inputs[1]
434 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
435 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
436
437 n = op.ofm_shapes[0].batch
438 h, w = batching_split.get(n, (1, n))
439 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
440 return op
441
442
443def unfuse_activation_function(op):
444 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
445 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
446 op.activation = None
447 out_tens = op.outputs[0]
448 intermediate_tens = out_tens.clone("_act_intermediate")
449 act_op.set_output_tensor(out_tens)
450 act_op.add_input_tensor(intermediate_tens)
451 op.set_output_tensor(intermediate_tens)
452 act_op.set_ifm_ofm_shapes()
453
454
455def rewrite_stridedslice_output(op, arch, nng):
456 if not op.run_on_npu or op.type != Op.StridedSlice:
457 return op
458
459 new_axis_mask = op.attrs["new_axis_mask"]
460 shrink_axis_mask = op.attrs["shrink_axis_mask"]
461
462 if shrink_axis_mask == 0 and new_axis_mask == 0:
463 return op
464
465 axis_4D = [0] * len(op.outputs)
466 for idx, out_tens in enumerate(op.outputs):
467 output_shape = list(out_tens.shape)
468
469 if shrink_axis_mask != 0:
470 n = 0
471 axis = 0
472 while shrink_axis_mask:
473 prev_mask = shrink_axis_mask
474 n += 1
475 shrink_axis_mask &= shrink_axis_mask - 1
476 axis = int(math.log2(prev_mask - shrink_axis_mask))
477 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
478
479 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
480 op.attrs["shrink_axis_mask"] = 0
481 if axis >= 0:
482 axis_4D[idx] = axis + (4 - len(output_shape))
483 else:
484 axis_4D[idx] = axis
485 op.ofm_shapes[idx] = Shape4D(output_shape)
486
487 elif new_axis_mask != 0:
488 n = 0
489 axis = 0
490 while new_axis_mask:
491 prev_mask = new_axis_mask
492 n += 1
493 new_axis_mask &= new_axis_mask - 1
494 axis = int(math.log2(prev_mask - new_axis_mask))
495 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
496 new_axis_mask >>= 1
497
498 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
499 op.attrs["new_axis_mask"] = 0
500 if axis >= 0:
501 axis_4D[idx] = axis + (4 - len(output_shape))
502 else:
503 axis_4D[idx] = axis
504 op.ofm_shapes[idx] = Shape4D(output_shape)
505
506 op.attrs["split_axis_4D"] = axis_4D
507 return op
508
509
510def rewrite_unpack_output(op, arch, nng):
511 tens = op.outputs[0]
512 if op.run_on_npu and op.type == Op.Unpack:
513 # Unpack is also referred to as Unstack
514 axis = int(op.attrs["axis"])
515 if axis < 0: # Convert to positive axis
516 axis = len(op.inputs[0].shape) + 1 + axis
517 op.type = Op.UnpackReshaped
518 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
519
520 axis_4D = axis + (4 - len(desired_output_shape))
521 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
522
523 for idx, out_tens in enumerate(op.outputs):
524 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
525 return op
526
527
528def add_padding_fields(op, arch, nng):
529 if op.run_on_npu:
530 if "padding" in op.attrs:
531 input_shape = op.ifm_shapes[0]
532 output_shape = op.ofm_shapes[0]
533 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
534 kernel_size = op.inputs[1].shape[:2]
535 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
536 kernel_size = op.attrs["ksize"][1:3]
537 else:
538 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
539
540 if op.type == Op.Conv2DBackpropInputSwitchedBias:
541 upscaling_factor = output_shape.height // input_shape.height
542 padding, skirt = calc_upscaled_padding_and_skirt(
543 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
544 )
545 else:
546 padding, skirt = calc_padding_and_skirt(
547 op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
548 )
549
550 op.attrs["explicit_padding"] = padding
551 op.attrs["skirt"] = skirt
552
553 return op
554
555
556def reorder_depthwise_weights(op, arch, nng):
557 if op.type.is_depthwise_conv2d_op():
558 weight_tensor = op.inputs[1]
559 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
560 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
561 weight_tensor.weight_transpose_depthwise = True
562
563 return op
564
565
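# Rewrites a stride-2 convolution at the start of the network into a stride-1 equivalent by
# folding pairs of horizontally adjacent IFM pixels into the depth dimension; the kernel width
# is padded to an even size with the weight zero point if needed, then halved while its input
# channels are doubled to match.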
566def optimise_strided_conv(op, arch, nng):
567 stride_x, stride_y = op.get_kernel_stride()
568 ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
569
570 if (
571 op.type == Op.Conv2DBias
572 and op.op_index == 0
573 and stride_x == 2
574 and op.ifm_shapes[0].depth <= 4
575 and op.ifm_shapes[0].width % 2 == 0
576 and weight_tensor is not None
577 and weight_tensor.shape[1] >= 2
578 ):
579 ifm_shape = op.ifm_shapes[0]
580 # IFM
581 op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])
582
583 # Weights
584 weight_shape = weight_tensor.shape
585 if weight_shape[1] % 2 != 0:
586 weight_shape[1] = weight_shape[1] + 1
587 padded_array = np.zeros(weight_shape)
588 for i in range(weight_shape[0]):
589 padded_array[i] = np.vstack(
590 [
591 weight_tensor.values[i],
592 np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
593 ]
594 )
595 weight_tensor.values = padded_array
596 weight_shape[1] //= 2
597 weight_shape[2] *= 2
598 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
599 weight_tensor.set_all_shapes(weight_shape)
600 # If multiple copies of the weights are used, we could avoid
601 # them having the same address by changing the value_id
602 weight_tensor.value_id = uuid.uuid4()
603
604 # Strides
605 stride_x = 1
606 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
607
608 return op
609
610
611def convert_conv_to_fc(op, arch, nng):
612 # Conv 1x1 can be equivalent to Fully Connected.
613 # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
614 # caching/double buffering for the weights.
615 # (Weights don't need to be reloaded for convs when IFM H and W are 1)
616 if op.type == Op.Conv2DBias:
617 h = op.ifm_shapes[0].height
618 w = op.ifm_shapes[0].width
619 kh, kw, _, _ = op.inputs[1].shape
620 if h == 1 and w == 1 and kh == 1 and kw == 1:
621 # Overwrite this op as a Fully Connected Op
622 op.name += "_fc"
623 op.type = Op.FullyConnected
624 op.attrs = {
625 "weights_format": 0,
626 }
627 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
628 weight_tensor = op.inputs[1]
629 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
630 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
631
632 DebugDatabase.add_optimised(op, op)
633 return op
634
635
636def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
637 if op.run_on_npu and op.type.is_relu_op():
638 ifm = op.inputs[0]
639 ofm = op.outputs[0]
640 # Relu with differing IFM and OFM scaling cannot be fused with another primary op
641 # and requires its own to be inserted
642 if not check_quantized_tens_scaling_equal(ifm, ofm):
643 # Override this op with its own primary op (avgpool)
644 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
645 # And fuse the original activation function to it
646 relu_fused_op.activation = create_activation_function(op.type)
647 # Tidy up and assign the ifm and ofm to the new op
648 ifm.consumer_list.remove(op)
649
650 relu_fused_op.add_input_tensor(ifm)
651 relu_fused_op.set_output_tensor(ofm)
652 relu_fused_op.set_ifm_ofm_shapes()
653 op = relu_fused_op
654 return op
655
656
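# Aligns the shapes of binary elementwise inputs: a lower-rank input (e.g. a [16] IFM2 against a
# 4D IFM) is extended with leading ones, and a scalar produced by another op is given an explicit
# broadcastable shape.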
657def fixup_elementwise_with_scalars(op, arch, nng):
658 if op.type.is_binary_elementwise_op():
659 ifm_tensor, ifm2_tensor, _, _ = op.get_ifm_ifm2_weights_ofm()
660 if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
661 diff = len(ifm_tensor.shape) - len(ifm2_tensor.shape)
662 if diff > 0:
663 ifm2_tensor.shape = full_shape(len(ifm_tensor.shape), ifm2_tensor.shape, 1)
664 elif diff < 0:
665 ifm_tensor.shape = full_shape(len(ifm2_tensor.shape), ifm_tensor.shape, 1)
666 elif ifm_tensor.shape == [] and ifm_tensor.values is None:
667 # IFM is marked as a scalar, but is a result of an operation; change it to a shape of size 1
668 ifm_tensor.shape = len(ifm2_tensor.shape) * [1]
669 ifm_tensor.storage_shape = ifm_tensor.shape
670 elif ifm2_tensor.shape == [] and ifm2_tensor.values is None:
671 # IFM2 is marked as a scalar, but is a result of an operation; change it to a shape of size 1
672 ifm2_tensor.shape = len(ifm_tensor.shape) * [1]
673 ifm2_tensor.storage_shape = ifm2_tensor.shape
674 return op
675
676
677def convert_softmax(op, arch, nng):
678 if op.type == Op.Softmax and op.run_on_npu:
679 softmax = SoftMax(op)
680 op = softmax.get_graph()
681 return op
682
683
684def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
685 r"""Whenever there is a subgraph with this topology:
686
687 Input X For X = -1 or X > 0
688 | \ / This subgraph can be replaced with either
689 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
690 | /
691 Max
692 """
693
694 if op.type == Op.Maximum:
695 # finds the Mul input(s) to the Max
696 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
697 if len(muls) == 1:
698 mul = muls[0].ops[0]
699 elif len(muls) == 2:
700 # In the case both inputs are Muls, find the one with the same input as the Max
701 mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
702 else:
703 # No Mul inputs
704 return op
705
706 # make sure the Mul doesn't have any other consumers
707 mul_ofm = mul.outputs[0]
708 if len(mul_ofm.consumers()) != 1:
709 return op
710 # make sure the Mul doesn't have a fused activation function
711 if mul.activation:
712 return op
713 ifm, ofm = op.get_ifm_ofm()
714 if ifm is None or ofm is None:
715 return op
716
717 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
718 return op
719 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
720 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
721 return op
722
723 # finds the branched input that goes to both the Max and the Mul
724 shared = set(op.inputs) & set(mul.inputs)
725 if len(shared) == 1:
726 shared_in = shared.pop()
727 # find the constant scalar input to the Mul
728 const_tens = (set(mul.inputs) - {shared_in}).pop()
729 # check that it is a scalar
730 if const_tens.shape != []:
731 return op
732 const = const_tens.ops[0]
733 # check that it is a constant
734 if const.type != Op.Const:
735 return op
736 # Remove the Mul from the shared input's consumers
737 shared_in.consumer_list.remove(mul)
738 else:
739 return op
740
741 val = const.outputs[0].values
742 if val >= 0:
743 new_op = Op.LeakyRelu
744 op.attrs["alpha"] = val
745 # to produce bit exact results, the alpha is not enough;
746 # save additional scaling info in attr "alpha_scale", to be used as input
747 # to the LUT construction
748 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
749 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
750 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
751 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
752 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
753 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
754 elif val == -1:
755 new_op = Op.Abs
756 else:
757 return op
758
759 op.type = new_op
760 op.name = op.name.replace("Maximum", new_op.name)
761 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
762 op.inputs = [shared_in]
763 op.set_ifm_ofm_shapes()
764
765 # Record optimisation in debug database
766 DebugDatabase.add_optimised(op, op)
767
768 return op
769
770
771def convert_hardswish_to_lut(op, arch, nng):
772 if op.type == Op.HardSwish:
773 ifm, ofm = op.get_ifm_ofm()
774 # Generate the LUT
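        # The LUT built below evaluates hardswish, x * relu6(x + 3) / 6, in 16-bit fixed point
        # for every possible quantized input value.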
775 ifm_scale = np.double(ifm.quantization.scale_f32)
776 ofm_scale = np.double(ofm.quantization.scale_f32)
777 zp_in = ifm.quantization.zero_point
778 zp_out = ofm.quantization.zero_point
779 ifm_scale_hires = (1 / 128) * ifm_scale
780 relu_multiplier = np.double(3 / 32768)
781 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
782 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
783 # Use 16bit scale
784 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
785 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
786
787 values = []
788 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
789 quantized_min = min(ix)
790 quantized_max = max(ix)
791 for x in ix:
792 input_value = x - zp_in
793 input_value_hires = input_value * 128
794 # Compute the input value on essentially the output scale, not shifted yet
795 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
796 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
797 relu_value = np.int16(input_value_hires)
798 if relu_shift < 31:
799 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
800
801 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
802
803 if relu_shift < 31:
804 relu_value = fp_math.shift_left16(relu_value, 1)
805
806 if relu_shift > 31:
807 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
808
809 # Rescale the value into a 16bit fixedpoint relu_value in [-1, 1]
810 # Now convert that to a 16bit fixedpoint value in [0, 1]
811 relu_value = (relu_value + (1 << 15)) >> 1
812 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
813 shift = 31 - out_shift
814 shift = -shift if shift < 0 else 0
815 # Finally apply the output shift
816 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
817 lut_result = min(quantized_max, max(quantized_min, lut_result))
818 values.append(lut_result)
819 return convert_to_lut(op, values, "hardswish")
820 return op
821
822
823def convert_lrelu_to_mul_max(op, arch):
824 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
825 # (the opposite of convert_mul_max_to_abs_or_lrelu)
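    # For 0 < alpha < 1 this reproduces LeakyRelu: the Mul path scales every value by alpha,
    # and the Max then selects the unscaled value whenever the input is positive.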
826 ifm, ofm = op.get_ifm_ofm()
827 if ifm is None or ofm is None:
828 return op
829
830 # Add multiplication with alpha
831 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
832 mul_alpha.add_input_tensor(ifm)
833 # Create const tensor containing alpha as scalar
834 alpha = np.float32(op.attrs["alpha"])
835 quantization = ifm.quantization.clone()
836 quantization.min = 0
837 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
838 quantization.zero_point = 0
839 if np.isinf(1 / alpha):
840 # Handling of alpha near zero
841 quantization.scale_f32 = np.float32(1)
842 scalar = 0
843 else:
844 quantization.scale_f32 = alpha
845 scalar = alpha
846 alpha_tens = create_const_tensor(
847 op.name + "_alpha_scalar", [], ifm.dtype, [scalar], np.float32, quantization=quantization
848 )
849 alpha_tens.values = np.array([1])
850 mul_alpha.add_input_tensor(alpha_tens)
851 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
852 mul_alpha.set_output_tensor(fm_alpha)
853 mul_alpha.set_ifm_ofm_shapes()
854 DebugDatabase.add_optimised(op, mul_alpha)
855
856 if check_quantized_tens_scaling_equal(ifm, ofm):
857 # No identity multiplication is needed
858 fm_id = ifm
859 else:
860 # Add multiplication with identity
861 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
862 mul_identity.add_input_tensor(ifm)
863 # Create const tensor containing identity as scalar
864 quantization = ifm.quantization.clone()
865 quantization.min = 0
866 quantization.max = quantization.quant_max - quantization.quant_min
867 quantization.scale_f32 = np.float32(1)
868 quantization.zero_point = 0
869 identity_tens = create_const_tensor(
870 op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
871 )
872 mul_identity.add_input_tensor(identity_tens)
873 # Make sure that fm_id is allocated to a different address than fm_alpha
874 fm_id = ofm.clone(op.name + "_id", set_unique=True)
875 mul_identity.set_output_tensor(fm_id)
876 mul_identity.set_ifm_ofm_shapes()
877 DebugDatabase.add_optimised(op, mul_identity)
878
879 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
880 op.type = Op.Maximum
881 op.name = op.name.replace("LeakyRelu", "Maximum")
882 op.inputs = []
883 ifm.consumer_list.remove(op)
884 op.add_input_tensor(fm_alpha)
885 op.add_input_tensor(fm_id)
886 op.set_ifm_ofm_shapes()
887
888 DebugDatabase.add_optimised(op, op)
889 return op
890
891
892def convert_to_lut(op, lut_values, lut_name):
893 # Rewrite the operation by Add with scalar 0 + LUT activation
894 ifm = op.inputs[0]
895 if ifm is None:
896 return op
897 assert ifm.dtype.size_in_bytes() == 1
898 op.type = Op.Add
899 op.name = op.name + "_lut_" + lut_name
900 # Mark as no-op to enable potential fusing optimizations
901 op.attrs["is_nop"] = True
902 # Create an input tensor containing scalar zero
903 quantization = QuantizationParameters(0.0, 255.0)
904 quantization.scale_f32 = ifm.quantization.scale_f32
905 quantization.zero_point = 0
906 tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
907 op.add_input_tensor(tens)
908 op.ifm_shapes.append(Shape4D(tens.shape))
909
910 # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
911 # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
912 # should be the same as the IFM
913 op.forced_output_quantization = ifm.quantization
914 lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
915 op.set_activation_lut(lut_tensor)
916 op.set_ifm_ofm_shapes()
917 return op
918
919
920def convert_to_lut8(op, fn, fn_name):
921 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
922 # fn is a function(real) -> real
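    # Illustrative example: with ifm_scale = 1/128 and zero point 0, the LUT entry for x = 64
    # is computed from fn(0.5), requantized with the OFM scale and clamped to the output range.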
923 ifm, ofm = op.get_ifm_ofm()
924 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
925 return op
926 # Generate the LUT
927 ifm_scale = np.double(ifm.quantization.scale_f32)
928 ofm_scale = np.double(ofm.quantization.scale_f32)
929 zp_in = ifm.quantization.zero_point
930 zp_out = ofm.quantization.zero_point
931 values = []
932 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
933 quantized_min = min(ix)
934 quantized_max = max(ix)
935 for x in ix:
936 x_real = ifm_scale * (x - zp_in)
937 y_real = fn(x_real)
938 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
939 lut_result = min(quantized_max, max(quantized_min, lut_result))
940 values.append(lut_result)
941 return convert_to_lut(op, values, fn_name)
942
943
944def convert_lrelu_to_lut(op, arch):
945 ifm, ofm = op.get_ifm_ofm()
946 # Generate the LUT
947 alpha = op.attrs["alpha"]
948 ifm_scale = np.double(ifm.quantization.scale_f32)
949 ofm_scale = np.double(ofm.quantization.scale_f32)
950 zp_in = ifm.quantization.zero_point
951 zp_out = ofm.quantization.zero_point
952 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
953 alpha_scalar = 1
954 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
955 if "alpha_scaling" in op.attrs:
956 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
957 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
958 values = []
959 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
960 quantized_min = min(ix)
961 quantized_max = max(ix)
962 for x in ix:
963 if x < zp_in:
964 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
965 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
966 )
967 else:
968 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
969 lut_result = min(quantized_max, max(quantized_min, lut_result))
970 values.append(lut_result)
971 return convert_to_lut(op, values, "lrelu")
972
973
974def convert_lrelu(op, arch, nng):
975 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
976 if op.type != Op.LeakyRelu:
977 return op
978 ifm, ofm = op.get_ifm_ofm()
979 if ifm is None or ofm is None:
980 return op
981 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
982 # use LUT for int8/uint8
983 return convert_lrelu_to_lut(op, arch)
984 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16:
985 # use LeakyRelu unmodified for int16 with equal input/output scaling
986 return op
987 return convert_lrelu_to_mul_max(op, arch)
988
989
990def convert_tanh_sigmoid_to_lut(op, arch, nng):
991 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
992 if op.type == Op.Sigmoid:
993 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
994 elif op.type == Op.Tanh:
995 return convert_to_lut8(op, math.tanh, "tanh")
996 return op
997
998
999def remove_reshape_and_squeeze_ops(op, arch):
1000 if op.run_on_npu and op.type in (Op.Reshape, Op.Squeeze):
1001 ofm = op.ofm
1002 ifm = op.ifm
1003
1004 # Check if quantization is the same in the input and output for the reshape ops
1005 if not check_quantized_tens_scaling_equal(ifm, ofm):
1006 # TODO Both tensors are needed, since quantisation properties currently are linked to Tensors.
1007 # In order to remove this reshape, either quantization properties need to be moved to Operator,
1008 # or the reshape needs to be replaced with a NOP.
1009 return
1010
1011 bypass_reshape_and_squeeze_ops(op)
1012
1013
1014def fuse_activation_function_with_prev(op, arch, nng):
1015 # if op is a no-op: attempts to move the activation function to the preceding op
1016 if not op.attrs.get("is_nop", False) or op.activation is None:
1017 return op
1018 ifm, ofm = op.get_ifm_ofm()
1019 if ifm is None or ofm is None:
1020 return op
1021 # finds the input(s) to the operation
1022 prev_op = ifm.ops[0]
1023 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1024 fuse = (
1025 prev_op.run_on_npu
1026 and prev_op.type.npu_block_type != NpuBlockType.Default
1027 and len(ifm.ops) == 1
1028 and len(prev_op.outputs[0].consumers()) == 1
1029 and prev_op.activation is None
1030 )
1031 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1032 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1033 # LUT currently only works correctly for elementwise ops
1034 fuse = False
1035 if not fuse:
1036 return op
1037 # Move the fused activation function + corresponding info to prev_op
1038 prev_op.activation = op.activation
1039 prev_op.forced_output_quantization = op.forced_output_quantization
1040 if op.activation_lut is not None:
1041 prev_op.set_activation_lut(op.activation_lut)
1042 # Bypass op
1043 prev_op.set_output_tensor(ofm)
1044 DebugDatabase.add_optimised(op, prev_op)
1045 return op
1046
1047
1048def _leading_pad_ok(leading_pad, stride, kernel_size):
1049 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1050 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
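    # For example, kernel size 7 with stride 2 gives max_size 3: a leading pad of 1 is rejected,
    # while 0, 2 and 3 are accepted.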
1051 max_size = kernel_size // 2
1052 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1053
1054
1055def replace_pad_by_hw_pad(op: Operation, arch, nng):
1056 """
1057 Tries to completely remove a PAD operator by using hardware padding.
1058 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1059 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1060 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1061 if both operations can be run on the NPU.
1062 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1063 """
1064 if (
1065 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
1066 and op.run_on_npu
1067 and op.attrs["padding"] == Padding.VALID
1068 ):
1069 pad_op = op.ifm.ops[0]
1070 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1071 return op
1072 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1073 return op
1074 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1075 k = op.kernel
1076 k_w, k_h = k.dilated_wh()
1077
1078 # Check if the PAD operator can be replaced by hardware padding
1079 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1080 # Too much padding, it would require hardware padding to actually insert zeros
1081 return op
1082 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1083 return op
1084
1085 if op.type.is_avgpool_op():
1086 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1087 for pad, k_size in (
1088 (left, k_w),
1089 (right, k_w),
1090 (top, k_h),
1091 (bottom, k_h),
1092 ):
1093 if pad not in (0, k_size // 2):
1094 return op
1095 # Average pool is converted to depthwise, because NPU average pool + same padding
1096 # has a special implementation that is different from PAD followed by average pool with
1097 # valid padding.
1098 k_w, k_h = op.kernel.width, op.kernel.height
1099 ifm = op.ifm
1100 # Remember other inputs
1101 other_inputs = op.inputs[1:]
1102 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1103 quantization = QuantizationParameters(0.0, 255.0)
1104 quantization.scale_f32 = 1.0 / (k_w * k_h)
1105 quantization.zero_point = 0
1106 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1107 weights = np.full(shape, 1)
1108
1109 weight_tens = create_const_tensor(
1110 op.name + "_weights",
1111 shape,
1112 op.ifm.dtype,
1113 weights,
1114 np.uint8,
1115 purpose=TensorPurpose.Weights,
1116 quantization=quantization,
1117 )
1118 weight_tens.values = weights
1119 op.type = Op.DepthwiseConv2DBias
1120 op.inputs = []
1121 op.add_input_tensor(ifm)
1122 op.add_input_tensor(weight_tens)
1123 # Add bias tensor, all biases set to 0
1124 op.inputs.append(None)
1125 fixup_bias_tensors(op, arch, nng)
1126 # Add other inputs
1127 op.inputs.extend(other_inputs)
1128 op.rounding_mode = NpuRoundingMode.NATURAL
1129
1130 # Bypass the PAD operator
1131 op.set_input_tensor(pad_op.ifm, 0)
1132 # Adjust the padding attributes of the convolution operator
1133 op.attrs["padding"] = Padding.EXPLICIT
1134 op.attrs["explicit_padding"] = (top, left, bottom, right)
1135 op.set_ifm_ofm_shapes()
1136 return op
1137
1138
1139def convert_pad(op: Operation, arch, nng):
1140 """
1141 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1142 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1143 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1144 """
1145 if op.type != Op.Pad or not op.run_on_npu:
1146 return op
1147 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1148
1149 ifm = op.ifm
1150 assert ifm is not None
1151 ifm_shape = Shape4D(ifm.shape)
1152 ofm = op.ofm
1153 assert ofm is not None
1154 ofm.ops = []
1155 ofm_shape = op.ofm_shapes[0]
1156
1157 # Average pool op that copies IFM to the right place inside the OFM
1158 shp0 = Shape4D(0, 0, 0, 0)
1159 shp_top = shp0.with_height(top)
1160 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1161 avgpool_op.activation = op.activation
1162 quant = ofm.quantization
1163 pad_value = quant.zero_point
1164 # Add operations that fill the borders of the OFM
1165 if top > 0:
1166 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1167 zero_tens = create_const_tensor(
1168 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1169 )
1170 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1171 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1172 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1173 if bottom > 0:
1174 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1175 zero_tens = create_const_tensor(
1176 op.name + "_bottom",
1177 shape.as_list(),
1178 ofm.dtype,
1179 shape.elements() * [pad_value],
1180 np.uint8,
1181 quantization=quant,
1182 )
1183 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1184 create_avg_pool_for_concat(
1185 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1186 )
1187 if left > 0:
1188 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1189 zero_tens = create_const_tensor(
1190 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1191 )
1192 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1193 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1194 if right > 0:
1195 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1196 zero_tens = create_const_tensor(
1197 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1198 )
1199 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1200 create_avg_pool_for_concat(
1201 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1202 )
1203
1204 op.type = Op.ConcatTFLite
1205 return avgpool_op
1206
1207
1208def add_attrs_to_resizebilinear(op, arch, nng):
1209 if op.type == Op.ResizeBilinear and op.run_on_npu:
1210 input_tensor = op.inputs[0]
1211 input_shape = op.ifm_shapes[0]
1212 upscaled_height = input_shape.height * 2
1213 upscaled_width = input_shape.width * 2
1214 out_shape = op.ofm_shapes[0]
1215 if not op.attrs["align_corners"] and out_shape.height == upscaled_height and out_shape.width == upscaled_width:
1216 # this means the output is supposed to be a 2x upscale,
1217 # so we need to do SAME padding
1218 op.attrs["padding"] = Padding.SAME
1219 elif (
1220 op.attrs["align_corners"]
1221 and out_shape.height == (upscaled_height - 1)
1222 and out_shape.width == (upscaled_width - 1)
1223 ):
1224 # here we can just run the avg pool without padding and
1225 # produce a (M * 2 - 1, N * 2 - 1) sized output
1226 op.attrs["padding"] = Padding.VALID
1227 else:
1228 return op
1229 input_tensor.resampling_mode = resampling_mode.NEAREST
1230 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
1231 return op
1232
1233
1234def fixup_bias_tensors(op, arch, nng):
1235 if op.type.needs_bias() and op.bias is None:
1236 # Op has no bias, add bias tensor filled with zeros
1237 nr_biases = op.inputs[1].shape[-1]
1238 bias_values = [0] * nr_biases
1239 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1240 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1241
1242 return op
1243
1244
1245def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1246 if op.type == Op.Mean and op.run_on_npu:
1247 keep_dims = op.attrs.get("keep_dims", False)
1248 inp, axis = op.inputs
1249 shape = inp.shape
1250 dims = len(shape)
1251
1252 # Height and width axes have different index depending on dimensions
1253 if axis.shape == [] or axis.shape[0] == 1: # single axis
1254 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1255 if dims in (2, 3):
1256 if axis == 0:
1257 h, w = shape[axis], 1
1258 else:
1259 h, w = 1, shape[axis]
1260 else:
1261 if axis == 1:
1262 h, w = shape[axis], 1
1263 else:
1264 h, w = 1, shape[axis]
1265 else: # multiple axes
1266 axis = sorted(axis.values)
1267 h, w = [shape[i] for i in axis]
1268
1269 # Set necessary depthwise attributes
1270 op.attrs.update(
1271 {
1272 "padding": Padding.VALID,
1273 "stride_h": 1,
1274 "stride_w": 1,
1275 "strides": (1, 1, 1, 1),
1276 "depth_multiplier": 1,
1277 "channel_multiplier": 1,
1278 "dilation_h_factor": 1,
1279 "dilation_w_factor": 1,
1280 "dilation": (1, 1, 1, 1),
1281 }
1282 )
1283 # Change op type
1284 op.type = Op.DepthwiseConv2DBias
1285 # Set IFM/OFM shapes after changing op type
1286 op.set_ifm_ofm_shapes()
1287
1288 weight_scale, bias = 1, None
1289 ofmq, ifmq = op.ofm.quantization, inp.quantization
1290 # Set rounding mode, scaling and zero point based on which reference implementation to match
1291 if len(shape) == 4 and axis == [1, 2] and keep_dims:
1292 if inp.dtype == DataType.uint8:
1293 # This attribute means a different scaling calculation is used in order to match reference
1294 op.low_precision_scaling = True
1295 weight_scale = h * w
1296 # Set zero points to 0 as they will be adjusted for with bias term
1297 foq = ofmq.clone()
1298 foq.zero_point = 0
1299 fiq = ifmq.clone()
1300 fiq.zero_point = 0
1301 op.forced_input_quantization = fiq
1302 bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
1303 # If the bias term is outside uint8 range, we need an Add op to apply it.
1304 if bias_term < 0 or bias_term > 255:
1305 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1306 # Bias term has higher bitness (i32) than input/output (u8).
1307 # 16 bits is enough since the bias is added/subtracted from a u8 value,
1308 # the bias can only effectively assume values in the range [-255, 255].
1309 intermediate.dtype = DataType.int16
1310 intermediate.quantization.zero_point = 0
1311 add_op = Operation(Op.Add, op.name + "_bias")
1312 add_op.forced_output_quantization = foq
1313 add_op.add_input_tensor(intermediate)
1314 quant = QuantizationParameters()
1315 quant.zero_point = 0
1316 bias_term_tens = create_const_tensor(
1317 op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
1318 )
1319 add_op.add_input_tensor(bias_term_tens)
1320 add_op.set_output_tensor(op.ofm)
1321 add_op.set_ifm_ofm_shapes()
1322 add_op.activation = op.activation
1323 op.activation = None
1324 op.set_output_tensor(intermediate)
1325 op.set_ifm_ofm_shapes()
1326 # If not, we can just do it with the OFM zero point.
1327 else:
1328 foq.zero_point = bias_term
1329 op.forced_output_quantization = foq
1330 else:
1331 assert inp.dtype == DataType.int8
1332 # Use a depthwise to calculate the sum,
1333 # followed by a multiplication with 1/N to get the MEAN
1334 weight_scale = 1
1335 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1336 intermediate.dtype = DataType.int16
1337 mul_op = Operation(Op.Mul, op.name + "_mul")
1338 mul_op.add_input_tensor(intermediate)
1339 # Create scalar containing 1/N
1340 quant = QuantizationParameters()
1341 quant.zero_point = 0
1342 # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
1343 # while rounding mode NATURAL would round this to -1.
1344 # This can only occur if N is even, and can be emulated by
1345 # multiplying with a number that is slightly smaller than 1/N.
1346 # It must be so small that other roundings are not affected;
1347 # the calculated value is based on worst case,
1348 # which is sum 256 * N (the maximum sum that can occur with int8)
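            # For example, with N = 4 a sum of -6 has reference mean -2 (-1.5 rounded downwards);
            # scaling by 1/(4 - eps) gives roughly -1.5003, which NATURAL rounding also takes to -2.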
1349 n = int(h * w)
1350 eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
1351 quant.scale_f32 = 1 / (n - eps)
1352 scalar = create_const_tensor(
1353 op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
1354 )
1355 mul_op.add_input_tensor(scalar)
1356 mul_op.set_output_tensor(op.ofm)
1357 mul_op.set_ifm_ofm_shapes()
1358 mul_op.rounding_mode = NpuRoundingMode.NATURAL
1359 mul_op.activation = op.activation
1360 op.activation = None
1361 op.set_output_tensor(intermediate)
1362 op.set_ifm_ofm_shapes()
1363 elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
1364 # Here we can just use a simple AvgPool with truncating rounding,
1365 # as we're emulating simple integer division.
1366 op.rounding_mode = NpuRoundingMode.TRUNCATE
1367 op.type = Op.AvgPool
1368 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1369 else:
1370 op.rounding_mode = NpuRoundingMode.NATURAL
1371 weight_scale = 1 / (h * w)
1372 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1373 bias = -ifmq.zero_point * h * w
1374 fiq = ifmq.clone()
1375 fiq.zero_point = 0
1376 op.forced_input_quantization = fiq
1377
1378 # Change dimensions to 4
1379 if dims < 4:
1380 shape = [1] + shape
1381 if dims == 2:
1382 shape += [1]
1383
1384 # If height is greater than max kernel height, reshape to from HxW to 1x(HxW)
1385 if h > 64:
1386 shape = [shape[0], 1, h * w, shape[3]]
1387 op.ifm_shapes[0] = Shape4D(shape)
1388 if h > 256 and op.type == Op.AvgPool:
1389 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1390
1391 # If the AvgPool version is used, we don't need to do anything else
1392 if op.type == Op.AvgPool:
1393 return op
1394
1395 # Make unit weight tensor quantization
1396 weight_quant = ifmq.clone()
1397 weight_quant.min = 0
1398 weight_quant.max = 255
1399 weight_quant.scale_f32 = weight_scale
1400 weight_quant.zero_point = 0
1401
1402 # Set weight shape to [H,W,C,B]
1403 weight_shape = shape[1:4] + [shape[0]]
1404 # Add unit weight tensor
1405 op.set_input_tensor(
1406 create_const_tensor(
1407 "weights",
1408 weight_shape,
1409 inp.dtype,
1410 np.ones(weight_shape),
1411 value_dtype=np.uint8,
1412 quantization=weight_quant,
1413 ),
1414 1,
1415 )
1416 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
1417
1418 # Add None bias tensor
1419 op.inputs.append(None)
1420 # Add bias tensor
1421 if bias:
1422 bias_shape = [shape[-1]]
1423 op.set_input_tensor(
1424 create_const_tensor(
1425 "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
1426 ),
1427 2,
1428 )
1429
1430 return op
1431
1432
1433def supported_operator_check(op, arch, nng):
1434 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
1435 return op
1436
1437
1438def tflite_optimise_graph(nng, arch):
1439 # Pre-processing step
1440 pre_process_list = [
1441 supported_operator_check,
1442 set_ifm_ofm_op_shapes,
1443 ]
1444
1445 for idx, sg in enumerate(nng.subgraphs):
1446 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1447 nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
1448 )
1449
1450 # Handle Concat Ops
1451 for idx, sg in enumerate(nng.subgraphs):
1452 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1453 sg.refresh_after_modification()
1454
1455 # Handle Split Ops
1456 for idx, sg in enumerate(nng.subgraphs):
1457 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1458 nng,
1459 sg,
1460 arch,
1461 [],
1462 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1463 rewrite_unsupported=False,
1464 )
1465
1466 for idx, sg in enumerate(nng.subgraphs):
1467 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1468 nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
1469 )
1470
1471 # Handle sg input output
1472 for idx, sg in enumerate(nng.subgraphs):
1473 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1474 nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
1475 )
1476
1477 # Removal of reshape and squeeze ops
1478 for sg in nng.subgraphs:
1479 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_reshape_and_squeeze_ops])
1480 sg.refresh_after_modification()
1481
1482 # Rewrite of operators
1483 op_rewrite_list = [
1484 set_tensor_equivalence,
1485 convert_mean_to_depthwise_conv_or_avgpool,
1486 convert_depthwise_to_conv,
1487 convert_conv_to_fc,
1488 convert_softmax,
1489 optimise_strided_conv,
1490 convert_hardswish_to_lut,
1491 rewrite_fully_connected_input,
1492 convert_batched_fc_shape,
1493 fixup_conv2d_backprop,
1494 fixup_relus_with_differing_ifm_ofm_scaling,
1495 fixup_elementwise_with_scalars,
1496 reorder_depthwise_weights,
1497 fixup_resizebilinear,
1498 fixup_bias_tensors,
1499 convert_mul_max_to_abs_or_lrelu,
1500 convert_lrelu,
1501 convert_tanh_sigmoid_to_lut,
1502 replace_pad_by_hw_pad,
1503 ]
1504
1505 for idx, sg in enumerate(nng.subgraphs):
1506 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1507 nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
1508 )
1509
1510 for idx, sg in enumerate(nng.subgraphs):
1511 # remove passthrough tensors and attempt further optimizations
1512 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1513 nng,
1514 sg,
1515 arch,
1516 [remove_passthrough_tensor],
1517 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
1518 )
1519
1520 # Removal of SplitSliceRead, need to be done after optimisation has been performed,
1521 # since ifm/ofm_shapes are of importance to this function
1522 for sg in nng.subgraphs:
1523 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
1524 sg.refresh_after_modification()
1525
1526 return nng