1# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16# Description:
17# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
18# to do the traversal of the graph.
19import math
20import uuid
21
22import numpy as np
23
24from . import fp_math
25from . import rewrite_graph
26from . import scaling
27from .api import NpuRoundingMode
28from .data_type import DataType
29from .debug_database import DebugDatabase
30from .errors import UnsupportedFeatureError
31from .ethos_u55_regs.ethos_u55_regs import resampling_mode
32from .graph_optimiser_util import bypass_memory_only_ops
33from .graph_optimiser_util import calc_explicit_padding
34from .graph_optimiser_util import convert_depthwise_to_conv
35from .graph_optimiser_util import convert_to_lut
36from .graph_optimiser_util import fix_sg_input_output
37from .graph_optimiser_util import memory_only_ops
38from .graph_optimiser_util import move_splitsliceread_to_consumer
39from .graph_optimiser_util import needed_total_padding
40from .graph_optimiser_util import set_ifm_ofm_op_shapes
41from .graph_optimiser_util import set_tensor_equivalence
42from .numeric_util import clamp_sigmoid
43from .numeric_util import round_away_zero
44from .operation import create_activation_function
45from .operation import ExplicitScaling
46from .operation import NpuBlockType
47from .operation import Op
48from .operation import Operation
49from .operation import Padding
50from .operation_util import create_avgpool_nop
51from .operation_util import get_pad_values_from_input
52from .shape4d import Shape4D
53from .softmax import SoftMax
54from .tensor import check_quantized_tens_scaling_equal
55from .tensor import create_const_tensor
56from .tensor import create_equivalence_id
57from .tensor import QuantizationParameters
58from .tensor import Tensor
59from .tensor import TensorPurpose
60from .tflite_mapping import optype_to_builtintype
61
62passthrough_nodes = (Op.Identity,)
63
64
65def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
66 """Creates an average pool for the given concat op/input feature map"""
67 ofm = concat_op.ofm
68 avgpool_op = create_avgpool_nop(name)
69 avgpool_op.inputs = [ifm]
70 avgpool_op.outputs = [ofm]
71
72 avgpool_op.write_offset = write_offset
73 avgpool_op.write_shape = ifm_shape
74 ofm.ops.append(avgpool_op)
75 DebugDatabase.add_optimised(concat_op, avgpool_op)
76 avgpool_op.ifm_shapes.append(ifm_shape)
77 avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
78 avgpool_op.memory_function = Op.ConcatSliceWrite
79 return avgpool_op
80
81
82def remove_passthrough_tensor(tens, arch, nng):
83 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
84 assert len(tens.ops[0].inputs) == 1
85 tens = tens.ops[0].inputs[0]
86 return tens
87
88
89def rewrite_concat_ops(op, arch):
90 if not op.run_on_npu or not op.type.is_concat_op():
91 return
92
93 axis_4D = 0
94 ofm = op.ofm
95 ofm.ops = []
96 offset = 0
97
98 unfuse_activation_function(op)
99
100 if op.type == Op.Pack:
101 # Pack is also referred to as Stack
102 axis = int(op.attrs["axis"])
103 if axis < 0: # Convert to positive axis
104 axis = len(op.inputs[0].shape) + 1 + axis
105
106 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
107
108 axis_4D = axis + (4 - len(desired_shape))
109
110 for idx, inp in enumerate(op.inputs):
111 op.ifm_shapes[idx] = Shape4D(desired_shape)
112 op.type = Op.PackReshaped
113
114 inputs, axis = op.get_concat_inputs_axis()
115 for idx, inp in enumerate(inputs):
116 if op.type != Op.PackReshaped:
117 op.ifm_shapes[idx] = Shape4D(inp.shape)
118 if axis >= 0:
119 axis_4D = axis + (4 - len(inp.shape))
120 else:
121 axis_4D = axis
122 write_offset = [0, 0, 0, 0]
123 write_offset[axis_4D] = offset
124 concat_end = offset + op.ifm_shapes[idx][axis_4D]
125 create_avg_pool_for_concat(
126 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
127 )
128 offset = concat_end
129 assert ofm.shape[axis] == offset
130
131 return op
132
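# Note on the rewrite above: an N-input concat is turned into N AvgPool no-ops that each
# copy their IFM into the shared OFM at a write offset along the concat axis. For example,
# concatenating (1, 8, 8, 16) and (1, 8, 8, 24) along the depth axis produces two
# ConcatSliceWrite avgpools with write offsets [0, 0, 0, 0] and [0, 0, 0, 16], and the
# final assert checks that the accumulated offset equals the OFM depth of 40.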
133
134def rewrite_split_ops(tens, arch, nng):
135
136 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
137 split_op = tens.ops[0]
138
139 # Not supported so leave it and run on CPU
140 if not split_op.run_on_npu:
141 return tens
142
143 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
144
145 tens.ops = []
146 new_op = Operation(Op.SplitSliceRead, split_op.name)
147 new_op.inputs = [inp]
148 ofm_shape_idx = 0
149 if None in (offset_end, offset_start):
150 read_shape = None
151 else:
152 # the read shape is relative to each start offset
153 read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]
154
155 # For Split the offset cannot be extracted from the tensor so it has to
156 # be calculated from the index of the output tensor
157 if axis is not None:
158 # Get the start and end of the split
159 offset_start = [0] * 4
160 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
161 for idx, out in enumerate(outputs):
162 if axis_4D_list is not None:
163 axis_4D = axis_4D_list[idx]
164 else:
165 split_op.ofm_shapes[idx] = Shape4D(out.shape)
166 if axis >= 0:
167 axis_4D = axis + (4 - len(out.shape))
168 else:
169 axis_4D = axis
170
171 if out == tens:
172 ofm_shape_idx = idx
173 read_shape = split_op.ofm_shapes[idx]
174 break
175
176 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
177
178 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
179 new_op.read_shapes[0] = read_shape
180 new_op.run_on_npu = True
181 new_op.set_output_tensor(tens)
182 new_op.ifm_shapes.append(Shape4D(inp.shape))
183 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
184 DebugDatabase.add_optimised(split_op, new_op)
185
186 return tens
187
188
189def remove_SplitSliceRead(op, arch):
190
191 if op.type == Op.SplitSliceRead:
192 # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
193 if (
194 len(op.ofm.consumer_list) == 1
195 and op.ofm.consumer_list[0] is not None
196 and op.ofm.consumer_list[0].run_on_npu
197 and op.ofm.consumer_list[0].type not in memory_only_ops
198 and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
199 ):
200 # SplitSliceRead can be performed by tensor consumer
201 cons_op = op.ofm.consumer_list[0]
202 move_splitsliceread_to_consumer(op, cons_op)
203 else:
204 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
205 avgpool_op.add_input_tensor(op.ifm)
206 avgpool_op.outputs = [op.ofm]
207 op.ofm.ops.remove(op)
208 op.ofm.ops.append(avgpool_op)
209 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
210 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
211 avgpool_op.read_offsets[0] = op.read_offsets[0]
212 avgpool_op.read_shapes[0] = op.read_shapes[0]
213
214 op.ifm.consumer_list.remove(op)
215 DebugDatabase.add_optimised(op, avgpool_op)
216
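# In other words, a SplitSliceRead is never executed as an op of its own: it is either
# folded into its single NPU consumer, which then reads the IFM using the stored read
# offset/shape, or, when the consumer cannot take it (CPU op, memory-only op, multiple
# consumers, or a reshaped OFM), it is replaced by an AvgPool no-op that performs the
# offset read into the OFM.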
217
218def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
219 k_w, k_h = kernel.dilated_wh()
220 s_x, s_y = kernel.stride
221 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
222 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
223 if padding_type == Padding.SAME:
224 left_pad = (xpad + 0) // 2
225 right_pad = (xpad + 1) // 2
226 top_pad = (ypad + 0) // 2
227 bottom_pad = (ypad + 1) // 2
228 elif padding_type == Padding.VALID:
229 left_pad = 0
230 right_pad = 0
231 top_pad = 0
232 bottom_pad = 0
233 elif padding_type == Padding.EXPLICIT:
234 # Padding is specified in a PAD operator which has been bypassed.
235 top, left, bottom, right = explicit_padding
236 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
237 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
238 else:
239 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
240 padding = (top_pad, left_pad, bottom_pad, right_pad)
241 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
242 return padding, skirt
243
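# Worked example for the SAME case above, assuming needed_total_padding() returns the
# standard total padding max(0, (ceil(in / stride) - 1) * stride + kernel - in):
#   input width 224, stride 2, dilated kernel width 3
#   -> xpad = (112 - 1) * 2 + 3 - 224 = 1, so left_pad = 0 and right_pad = 1
# The returned skirt (top, left, ypad - top, xpad - left) keeps the full padding totals,
# which downstream code uses when working out how far kernel reads extend past the IFM.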
244
245def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
246 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
247 if padding_type == Padding.SAME:
248 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
249 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
250 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
251 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
252 left_pad = max(kernel_width - 1 - right_pad, 0)
253 top_pad = max(kernel_height - 1 - bottom_pad, 0)
254 elif padding_type == Padding.VALID:
255 right_pad = max(kernel_width - 2, 0)
256 bottom_pad = max(kernel_height - 2, 0)
257 left_pad = kernel_width - 1
258 top_pad = kernel_height - 1
259 else:
260 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
261 padding = (top_pad, left_pad, bottom_pad, right_pad)
262 skirt = padding
263 return padding, skirt
264
265
266def fixup_conv2d_backprop(op, arch, nng):
267 if op.type == Op.Conv2DBackpropInput:
268 # flip the inputs
269 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
270 op.type = Op.Conv2DBackpropInputSwitchedBias
271 op.ifm.resampling_mode = resampling_mode.TRANSPOSE
272
273 # Update strides
274 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
275
276 return op
277
278
279# Convert the op to an elementwise add
280def convert_resizebilinear_1x1_to_add(op):
281 op.type = Op.Add
282 op.name = op.name + "_add"
283 op.attrs["resizebilinear"] = True
284 # Create an input tensor filled with zeros
285 shape = op.ofm_shapes[0].as_list()
286 tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
287 tens.values = np.zeros(shape, tens.dtype.as_numpy_type())
288 tens.quantization = QuantizationParameters(0.0, 255.0)
289 tens.quantization.scale_f32 = 1.0
290 tens.quantization.zero_point = 0
291 tens.consumer_list = [op]
292 tens_op = op.inputs[1].ops[0]
293 tens_op.set_output_tensor(tens)
294 # Set the add inputs
295 op.inputs[1] = op.inputs[0]
296 op.inputs[0] = tens
297 op.set_ifm_ofm_shapes()
298
299 return op
300
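# Rationale for the rewrite above: a ResizeBilinear with a 1x1 IFM simply broadcasts the
# single input pixel over the whole OFM, so it can be expressed as an elementwise Add of
# the IFM with an all-zero tensor of the output shape, which maps directly onto the NPU
# elementwise engine.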
301
302# Convert ResizeBilinear to a number of 2x2 nearest neighbor upscaling ops and one avgpool op with kernel size
303# dependent on the upscaling factor. The avgpool kernel limit of 8x8 when padding is applied limits upscaling to 8x8.
304def convert_resizebilinear_to_nearest_neighbor_upscaling_and_pool(op):
305 pre_op = op
306 outputs = op.outputs
307 dtype = op.ifm.dtype
308 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
309 if op.attrs["align_corners"]:
310 shape_modifier = 1
311 op.attrs["padding"] = Padding.VALID
312 else:
313 shape_modifier = 0
314 op.attrs["padding"] = Padding.SAME
315 op.inputs[0].resampling_mode = resampling_mode.NEAREST
316
317 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
318 out_shape = np.array(op.ofm_shapes[0].get_hw_as_list())
319
320 # Calculate how many times 2x2 upscaling needs to be performed
321 upscale_factor = round(out_shape[1] / upscaled_shape[1])
322 n = int(np.log2(upscale_factor))
323
324 # Perform 2x2 upscaling n-1 times
325 scaled_op = pre_op
326 for count in range(n - 1):
327 if count > 0:
328 scaled_op = op.clone(f"_{count}")
329 scaled_op.inputs[0] = pre_op.outputs[0]
330
331 # Nearest neighbor 2x2 upscaling
332 upscaled_shape = upscaled_shape * 2 - shape_modifier
333 shape = op.ofm_shapes[0].as_list()
334 shape[1:3] = upscaled_shape
335 out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
336 out_tens.quantization = op.outputs[0].quantization.clone()
337 scaled_op.set_output_tensor(out_tens)
338 pre_op = scaled_op
339
340 scaled_op.set_ifm_ofm_shapes()
341
342 # Last 2x2 upscaling also applies avgpool with kernel size dependent on the upscaling factor and adds
343 # padding to the right and bottom.
344 if n > 1:
345 scaled_op = op.clone(f"_{n-1}")
346 scaled_op.inputs[0] = pre_op.outputs[0]
347 scaled_op.attrs["padding"] = Padding.EXPLICIT
348 scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]
349 scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
350 scaled_op.outputs = outputs
351 scaled_op.outputs[0].ops = [scaled_op]
352 scaled_op.set_ifm_ofm_shapes()
353
354 return op
355
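# Example of the decomposition above: an 8x ResizeBilinear (upscale_factor = 8, n = 3)
# becomes two plain 2x2 nearest-neighbour upscaling steps followed by a last 2x2 upscaling
# step whose pooling kernel is 8x8 with explicit right/bottom padding of 7, which produces
# the bilinear-style averaging at the final output size.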
356
357def fixup_resizebilinear(op, arch, nng):
358 if op.type == Op.ResizeBilinear and op.run_on_npu:
359 if op.ifm_shapes[0] == op.ofm_shapes[0]:
360 # Bypass nop resizebilinear
361 op.inputs = op.inputs[:1]
362 op.type = Op.Identity
363 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
364 convert_resizebilinear_1x1_to_add(op)
365 else:
366 convert_resizebilinear_to_nearest_neighbor_upscaling_and_pool(op)
367
368 return op
369
370
371def convert_nop_split_to_identity(op, arch, nng):
372 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
373 # the list comprehension should return a list with a single tensor
374 # if it shouldn't, remove_passthrough_tensor will fail appropriately
375 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
376 op.type = Op.Identity
377 return op
378
379
380def rewrite_fully_connected_input(op, arch, nng):
381 if op.type == Op.FullyConnected:
382 n_in_elems = op.weights.shape[-2]
383 elms = op.ifm.elements()
384 batch_size = elms // n_in_elems
385 assert batch_size * n_in_elems == elms
386
387 op.ifm_shapes[0] = Shape4D([batch_size, 1, 1, n_in_elems])
388 return op
389
390
391def convert_batched_fc_shape(op, arch, nng):
392 if op.type == Op.FullyConnected:
393 # Check if the first dimension indicates batching
394 if op.ifm_shapes[0].batch > 1:
395 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
396 n = op.ifm_shapes[0].batch
397 h, w = batching_split.get(n, (1, n))
398 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
399
400 # Reshape Weights to be 4D. IO becomes HWIO
401 weight_tensor = op.inputs[1]
402 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
403 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
404
405 n = op.ofm_shapes[0].batch
406 h, w = batching_split.get(n, (1, n))
407 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
408 return op
409
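# Example of the reshapes above: a FullyConnected with batch 8 gets its IFM viewed as a
# (1, 2, 4, depth) feature map per batching_split, batch 4 becomes (1, 2, 2, depth), and
# any other batch N falls back to (1, 1, N, depth); the 2-D IO weights gain two leading
# singleton axes so they become HWIO with H = W = 1.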
410
411def unfuse_activation_function(op):
412 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
413 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
414 op.activation = None
415 out_tens = op.outputs[0]
416 intermediate_tens = out_tens.clone("_act_intermediate")
417 act_op.set_output_tensor(out_tens)
418 act_op.add_input_tensor(intermediate_tens)
419 op.set_output_tensor(intermediate_tens)
420 act_op.set_ifm_ofm_shapes()
421
422
423def rewrite_stridedslice_output(op, arch, nng):
424 if not op.run_on_npu or op.type != Op.StridedSlice:
425 return op
426
427 new_axis_mask = op.attrs["new_axis_mask"]
428 shrink_axis_mask = op.attrs["shrink_axis_mask"]
429
430 if shrink_axis_mask == 0 and new_axis_mask == 0:
431 return op
432
433 axis_4D = [0] * len(op.outputs)
434 for idx, out_tens in enumerate(op.outputs):
435 output_shape = list(out_tens.shape)
436
437 if shrink_axis_mask != 0:
438 n = 0
439 axis = 0
440 while shrink_axis_mask:
441 prev_mask = shrink_axis_mask
442 n += 1
443 shrink_axis_mask &= shrink_axis_mask - 1
444 axis = int(math.log2(prev_mask - shrink_axis_mask))
445 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
446
447 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
448 op.attrs["shrink_axis_mask"] = 0
449 if axis >= 0:
450 axis_4D[idx] = axis + (4 - len(output_shape))
451 else:
452 axis_4D[idx] = axis
453 op.ofm_shapes[idx] = Shape4D(output_shape)
454
455 elif new_axis_mask != 0:
456 n = 0
457 axis = 0
458 while new_axis_mask:
459 prev_mask = new_axis_mask
460 n += 1
461 new_axis_mask &= new_axis_mask - 1
462 axis = int(math.log2(prev_mask - new_axis_mask))
463 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
464 new_axis_mask >>= 1
465
466 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
467 op.attrs["new_axis_mask"] = 0
468 if axis >= 0:
469 axis_4D[idx] = axis + (4 - len(output_shape))
470 else:
471 axis_4D[idx] = axis
472 op.ofm_shapes[idx] = Shape4D(output_shape)
473
474 op.attrs["split_axis_4D"] = axis_4D
475 return op
476
477
478def rewrite_unpack_output(op, arch, nng):
479 tens = op.outputs[0]
480 if op.run_on_npu and op.type == Op.Unpack:
481 # Unpack is also referred to as Unstack
482 axis = int(op.attrs["axis"])
483 if axis < 0: # Convert to positive axis
484 axis = len(op.inputs[0].shape) + 1 + axis
485 op.type = Op.UnpackReshaped
486 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
487
488 axis_4D = axis + (4 - len(desired_output_shape))
489 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
490
491 for idx, out_tens in enumerate(op.outputs):
492 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
493 return op
494
495
496def add_padding_fields(op, arch, nng):
497 if op.run_on_npu:
498 if "padding" in op.attrs:
499 input_shape = op.ifm_shapes[0]
500 output_shape = op.ofm_shapes[0]
501 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
502 kernel_size = op.inputs[1].shape[:2]
503 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
504 kernel_size = op.attrs["ksize"][1:3]
505 else:
506 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
507
508 if op.type == Op.Conv2DBackpropInputSwitchedBias:
509 upscaling_factor = output_shape.height // input_shape.height
510 padding, skirt = calc_upscaled_padding_and_skirt(
511 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
512 )
513 else:
514 padding, skirt = calc_padding_and_skirt(
515 op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
516 )
517
518 op.attrs["explicit_padding"] = padding
519 op.attrs["skirt"] = skirt
520
521 return op
522
523
524def reorder_depthwise_weights(op, arch, nng):
525 if op.type.is_depthwise_conv2d_op():
526 weight_tensor = op.inputs[1]
527 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
528 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
529 weight_tensor.weight_transpose_depthwise = True
530
531 return op
532
533
534def optimise_strided_conv(op, arch, nng):
535 stride_x, stride_y = op.get_kernel_stride()
536 ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()
537
538 if (
539 op.type == Op.Conv2DBias
540 and op.op_index == 0
541 and stride_x == 2
542 and op.ifm_shapes[0].depth <= 4
543 and op.ifm_shapes[0].width % 2 == 0
544 and weight_tensor is not None
545 and weight_tensor.shape[1] >= 2
546 ):
547 ifm_shape = op.ifm_shapes[0]
548 # IFM
549 op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])
550
551 # Weights
552 weight_shape = weight_tensor.shape
553 if weight_shape[1] % 2 != 0:
554 weight_shape[1] = weight_shape[1] + 1
555 padded_array = np.zeros(weight_shape)
556 for i in range(weight_shape[0]):
557 padded_array[i] = np.vstack(
558 [
559 weight_tensor.values[i],
560 np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
561 ]
562 )
563 weight_tensor.values = padded_array
564 weight_shape[1] //= 2
565 weight_shape[2] *= 2
566 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
567 weight_tensor.set_all_shapes(weight_shape)
568 # If multiple copies of the weights are used, we could avoid
569 # them having the same address by changing the value_id
570 weight_tensor.value_id = uuid.uuid4()
571
572 # Strides
573 stride_x = 1
574 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
575
576 return op
577
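# Example of the stride-2 optimisation above: a first-layer Conv2D (op_index 0) with
# stride 2 on a (1, 128, 128, 3) IFM is rewritten to a stride-1 convolution on a
# (1, 128, 64, 6) view of the same data, with the kernel width halved and its columns
# folded into the input-channel dimension (padding with the weight zero point first if
# the kernel width is odd), so the arithmetic result is unchanged.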
578
579def convert_conv_to_fc(op, arch, nng):
580 # Conv 1x1 can be equivalent to Fully Connected.
581 # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
582 # caching/double buffering for the weights.
583 # (Weights don't need to be reloaded for convs when IFM H and W are 1)
584 if op.type == Op.Conv2DBias:
585 h = op.ifm_shapes[0].height
586 w = op.ifm_shapes[0].width
587 kh, kw, _, _ = op.inputs[1].shape
588 if h == 1 and w == 1 and kh == 1 and kw == 1:
589 # Overwrite this op as a Fully Connected Op
590 op.name += "_fc"
591 op.type = Op.FullyConnected
592 op.attrs = {
593 "weights_format": 0,
594 }
595 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
596 weight_tensor = op.inputs[1]
597 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
598 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
599
600 DebugDatabase.add_optimised(op, op)
601 return op
602
603
604def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
605 if op.run_on_npu and op.type.is_relu_op():
606 ifm = op.inputs[0]
607 ofm = op.outputs[0]
608 # Relu with differing IFM and OFM scaling cannot be fused with another primary op
609 # and requires its own primary op to be inserted
610 if not check_quantized_tens_scaling_equal(ifm, ofm):
611 # Override this op with its own primary op (avgpool)
612 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
613 # And fuse the original activation function to it
614 relu_fused_op.activation = create_activation_function(op.type)
615 # Add explicit rescaling
616 rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
617 multiplier, shift = scaling.quantise_scale(rescale)
618 relu_fused_op.rescale = ExplicitScaling(False, [shift], [multiplier])
619 # Tidy up and assign the ifm and ofm to the new op
620 ifm.consumer_list.remove(op)
621
622 relu_fused_op.add_input_tensor(ifm)
623 relu_fused_op.set_output_tensor(ofm)
624 relu_fused_op.set_ifm_ofm_shapes()
625 op = relu_fused_op
626 return op
627
628
629def convert_softmax(op, arch, nng):
630 if op.type == Op.Softmax and op.run_on_npu:
631 softmax = SoftMax(op)
632 op = softmax.get_graph()
633 return op
634
635
636def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
637 r"""Whenever there is a subgraph with this topology:
638
639 Input X For X = -1 or X > 0
640 | \ / This subgraph can be replaced with either
641 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
642 | /
643 Max
644 """
645
646 if op.type == Op.Maximum:
647 # finds the Mul input(s) to the Max
648 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
649 if len(muls) == 1:
650 mul = muls[0].ops[0]
651 elif len(muls) == 2:
652 # In the case both inputs are Muls, find the one with the same input as the Max
653 mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
654 else:
655 # No Mul inputs
656 return op
657
658 # make sure the Mul doesn't have any other consumers
659 mul_ofm = mul.outputs[0]
660 if len(mul_ofm.consumers()) != 1:
661 return op
662 # make sure the Mul doesn't have a fused activation function
663 if mul.activation:
664 return op
665 ifm, ofm = op.get_ifm_ofm()
666 if ifm is None or ofm is None:
667 return op
668
669 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
670 return op
671 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
672 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
673 return op
674
675 # finds the branched input that goes to both the Max and the Mul
676 shared = set(op.inputs) & set(mul.inputs)
677 if len(shared) == 1:
678 shared_in = shared.pop()
679 # find the constant scalar input to the Mul
680 const_tens = (set(mul.inputs) - {shared_in}).pop()
681 # check that it is a scalar
682 if const_tens.shape != []:
683 return op
684 const = const_tens.ops[0]
685 # check that it is a constant
686 if const.type != Op.Const:
687 return op
688 # Remove the Mul from the shared input's consumers
689 shared_in.consumer_list.remove(mul)
690 else:
691 return op
692
693 val = const.outputs[0].values
694 if val >= 0:
695 new_op = Op.LeakyRelu
696 op.attrs["alpha"] = val
697 # to produce bit-exact results, the alpha is not enough;
698 # save additional scaling info in attr "alpha_scaling", to be used as input
699 # to the LUT construction
700 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
701 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
702 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
703 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
704 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
705 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
706 elif val == -1:
707 new_op = Op.Abs
708 else:
709 return op
710
711 op.type = new_op
712 op.name = op.name.replace("Maximum", new_op.name)
713 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
714 op.inputs = [shared_in]
715 op.set_ifm_ofm_shapes()
716
717 # Record optimisation in debug database
718 DebugDatabase.add_optimised(op, op)
719
720 return op
721
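# The rewrite above relies on the identity max(alpha * x, x) == LeakyRelu(x) for alpha in
# [0, 1] (for x >= 0 the x branch wins, for x < 0 the alpha * x branch wins), and on
# max(-x, x) == abs(x) for the alpha == -1 case. convert_lrelu_to_mul_max further below
# applies the same identity in the opposite direction.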
722
723def convert_hardswish_to_lut(op, arch, nng):
724 if op.type == Op.HardSwish:
725 ifm, ofm = op.get_ifm_ofm()
726 # Generate the LUT
727 ifm_scale = np.double(ifm.quantization.scale_f32)
728 ofm_scale = np.double(ofm.quantization.scale_f32)
729 zp_in = ifm.quantization.zero_point
730 zp_out = ofm.quantization.zero_point
731 ifm_scale_hires = (1 / 128) * ifm_scale
732 relu_multiplier = np.double(3 / 32768)
733 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
734 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
735 # Use 16bit scale
736 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
737 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
738
739 values = []
740 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
741 quantized_min = min(ix)
742 quantized_max = max(ix)
743 for x in ix:
744 input_value = x - zp_in
745 input_value_hires = input_value * 128
746 # Compute the input value on essentially the output scale, not shifted yet
747 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
748 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
749 relu_value = np.int16(input_value_hires)
750 if relu_shift < 31:
751 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
752
753 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
754
755 if relu_shift < 31:
756 relu_value = fp_math.shift_left16(relu_value, 1)
757
758 if relu_shift > 31:
759 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
760
761 # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
762 # Now convert that to a 16bit fixedpoint value in [0, 1]
763 relu_value = (relu_value + (1 << 15)) >> 1
764 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
765 shift = 31 - out_shift
766 shift = -shift if shift < 0 else 0
767 # Finally apply the output shift
768 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
769 lut_result = min(quantized_max, max(quantized_min, lut_result))
770 values.append(lut_result)
771 return convert_to_lut(op, values, "hardswish")
772 return op
773
774
775def convert_lrelu_to_mul_max(op, arch):
776 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
777 # (the opposite of convert_mul_max_to_abs_or_lrelu)
778 ifm, ofm = op.get_ifm_ofm()
779 if ifm is None or ofm is None:
780 return op
781
782 # Add multiplication with alpha
783 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
784 mul_alpha.add_input_tensor(ifm)
785 # Create const tensor containing alpha as scalar
786 alpha = np.float32(op.attrs["alpha"])
787 quantization = ifm.quantization.clone()
788 quantization.min = 0
789 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
790 quantization.zero_point = 0
791 if np.isinf(1 / alpha):
792 # Handling of alpha near zero
793 quantization.scale_f32 = np.float32(1)
794 scalar = 0
795 else:
796 quantization.scale_f32 = alpha
797 scalar = alpha
798 alpha_tens = create_const_tensor(
799 op.name + "_alpha_scalar", [], ifm.dtype, [scalar], np.float32, quantization=quantization
800 )
801 alpha_tens.values = np.array([1])
802 mul_alpha.add_input_tensor(alpha_tens)
803 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
804 mul_alpha.set_output_tensor(fm_alpha)
805 mul_alpha.set_ifm_ofm_shapes()
806 DebugDatabase.add_optimised(op, mul_alpha)
807
808 if check_quantized_tens_scaling_equal(ifm, ofm):
809 # No identity multiplication is needed
810 fm_id = ifm
811 else:
812 # Add multiplication with identity
813 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
814 mul_identity.add_input_tensor(ifm)
815 # Create const tensor containing identity as scalar
816 quantization = ifm.quantization.clone()
817 quantization.min = 0
818 quantization.max = quantization.quant_max - quantization.quant_min
819 quantization.scale_f32 = np.float32(1)
820 quantization.zero_point = 0
821 identity_tens = create_const_tensor(
822 op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
823 )
824 mul_identity.add_input_tensor(identity_tens)
825 # Make sure that fm_id is allocated to a different address than fm_alpha
826 fm_id = ofm.clone(op.name + "_id", set_unique=True)
827 mul_identity.set_output_tensor(fm_id)
828 mul_identity.set_ifm_ofm_shapes()
829 DebugDatabase.add_optimised(op, mul_identity)
830
831 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
832 op.type = Op.Maximum
833 op.name = op.name.replace("LeakyRelu", "Maximum")
834 op.inputs = []
835 ifm.consumer_list.remove(op)
836 op.add_input_tensor(fm_alpha)
837 op.add_input_tensor(fm_id)
838 op.set_ifm_ofm_shapes()
839
840 DebugDatabase.add_optimised(op, op)
841 return op
842
843
844def convert_to_lut8(op, fn, fn_name):
845 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
846 # fn is a function(real) -> real
847 ifm, ofm = op.get_ifm_ofm()
848 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
849 return op
850 # Generate the LUT
851 ifm_scale = np.double(ifm.quantization.scale_f32)
852 ofm_scale = np.double(ofm.quantization.scale_f32)
853 zp_in = ifm.quantization.zero_point
854 zp_out = ofm.quantization.zero_point
855 values = []
856 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
857 quantized_min = min(ix)
858 quantized_max = max(ix)
859 for x in ix:
860 x_real = ifm_scale * (x - zp_in)
861 y_real = fn(x_real)
862 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
863 lut_result = min(quantized_max, max(quantized_min, lut_result))
864 values.append(lut_result)
865 return convert_to_lut(op, values, fn_name)
866
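# The LUT based conversions above and below all follow the same pattern: for 8-bit tensors
# the activation is replaced by a 256-entry table, where each entry is computed by
# dequantising the input value ((x - zp_in) * ifm_scale), applying the real-valued
# function, requantising with the OFM scale/zero point and clamping to the quantised range.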
867
868def convert_lrelu_to_lut(op, arch):
869 ifm, ofm = op.get_ifm_ofm()
870 # Generate the LUT
871 alpha = op.attrs["alpha"]
872 ifm_scale = np.double(ifm.quantization.scale_f32)
873 ofm_scale = np.double(ofm.quantization.scale_f32)
874 zp_in = ifm.quantization.zero_point
875 zp_out = ofm.quantization.zero_point
876 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
877 alpha_scalar = 1
878 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
879 if "alpha_scaling" in op.attrs:
880 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
881 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
882 values = []
883 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
884 quantized_min = min(ix)
885 quantized_max = max(ix)
886 for x in ix:
887 if x < zp_in:
888 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
889 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
890 )
891 else:
892 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
893 lut_result = min(quantized_max, max(quantized_min, lut_result))
894 values.append(lut_result)
895 return convert_to_lut(op, values, "lrelu")
896
897
898def convert_lrelu(op, arch, nng):
899 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
900 if op.type != Op.LeakyRelu:
901 return op
902 ifm, ofm = op.get_ifm_ofm()
903 if ifm is None or ofm is None:
904 return op
905 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
906 # use LUT for int8/uint8
907 return convert_lrelu_to_lut(op, arch)
908 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16:
909 # use LeakyRelu unmodified for int16 with equal input/output scaling
910 return op
911 return convert_lrelu_to_mul_max(op, arch)
912
913
914def convert_tanh_sigmoid_to_lut(op, arch, nng):
915 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
916 if op.type == Op.Sigmoid:
917 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
918 elif op.type == Op.Tanh:
919 return convert_to_lut8(op, math.tanh, "tanh")
920 return op
921
922
923def remove_memory_only_ops(op, arch):
924 if op.run_on_npu and op.type in memory_only_ops:
925 bypass_memory_only_ops(op)
926
927
928def fuse_activation_function_with_prev(op, arch, nng):
929 # if op is a no-op: attempts to move the activation function to the preceding op
930 if not op.attrs.get("is_nop", False) or op.activation is None:
931 return op
932 ifm, ofm = op.get_ifm_ofm()
933 if ifm is None or ofm is None:
934 return op
935 # finds the input(s) to the operation
936 prev_op = ifm.ops[0]
937 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
938 fuse = (
939 prev_op.run_on_npu
940 and prev_op.type.npu_block_type != NpuBlockType.Default
941 and len(ifm.ops) == 1
942 and len(prev_op.outputs[0].consumers()) == 1
943 and prev_op.activation is None
944 )
945 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
946 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
947 # LUT currently only works correctly for elementwise ops
948 fuse = False
949 if not fuse:
950 return op
951 # Move the fused activation function + corresponding info to prev_op
952 prev_op.activation = op.activation
953 prev_op.forced_output_quantization = op.forced_output_quantization
954 if op.activation_lut is not None:
955 prev_op.set_activation_lut(op.activation_lut)
956 # Bypass op
957 prev_op.set_output_tensor(ofm)
958 DebugDatabase.add_optimised(op, prev_op)
959 return op
960
961
962def _leading_pad_ok(leading_pad, stride, kernel_size):
963 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
964 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
965 max_size = kernel_size // 2
966 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
967
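# Example of the check above: with a 7x7 kernel (max_size 3) and stride 2, a top/left pad
# of 0, 2 or 3 can be replaced by hardware padding, but a pad of 1 cannot, because the
# kernel would then iterate the wrong IFM rows/columns.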
968
969def replace_pad_by_hw_pad(op: Operation, arch, nng):
970 """
971 Tries to completely remove a PAD operator by using hardware padding.
972 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
973 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
974 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
975 if both operations can be run on the NPU.
976 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
977 """
978 if (
979 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
980 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
981 and op.run_on_npu
982 and op.attrs["padding"] == Padding.VALID
983 ):
984 pad_op = op.ifm.ops[0]
985 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
986 return op
987 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
988 return op
989 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
990 k = op.kernel
991 k_w, k_h = k.dilated_wh()
992
993 # Check if the PAD operator can be replaced by hardware padding
994 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
995 # Too much padding, it would require hardware padding to actually insert zeros
996 return op
997 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
998 return op
999
1000 if op.type.is_avgpool_op():
1001 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1002 for pad, k_size in (
1003 (left, k_w),
1004 (right, k_w),
1005 (top, k_h),
1006 (bottom, k_h),
1007 ):
1008 if pad not in (0, k_size // 2):
1009 return op
1010 # Average pool is converted to depthwise, because NPU average pool + same padding
1011 # has a special implementation that is different from PAD followed by average pool with
1012 # valid padding.
1013 k_w, k_h = op.kernel.width, op.kernel.height
1014 ifm = op.ifm
1015 # Remember other inputs
1016 other_inputs = op.inputs[1:]
1017 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1018 quantization = QuantizationParameters(0.0, 255.0)
1019 quantization.scale_f32 = 1.0 / (k_w * k_h)
1020 quantization.zero_point = 0
1021 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1022 weights = np.full(shape, 1)
1023
1024 weight_tens = create_const_tensor(
1025 op.name + "_weights",
1026 shape,
1027 op.ifm.dtype,
1028 weights,
1029 np.uint8,
1030 purpose=TensorPurpose.Weights,
1031 quantization=quantization,
1032 )
1033 weight_tens.values = weights
1034 op.type = Op.DepthwiseConv2DBias
1035 op.inputs = []
1036 op.add_input_tensor(ifm)
1037 op.add_input_tensor(weight_tens)
1038 # Add bias tensor, all biases set to 0
1039 op.inputs.append(None)
1040 fixup_bias_tensors(op, arch, nng)
1041 # Add other inputs
1042 op.inputs.extend(other_inputs)
1043 op.rounding_mode = NpuRoundingMode.NATURAL
1044
1045 # Bypass the PAD operator
1046 op.set_input_tensor(pad_op.ifm, 0)
1047 # Adjust the padding attributes of the convolution operator
1048 op.attrs["padding"] = Padding.EXPLICIT
1049 op.attrs["explicit_padding"] = (top, left, bottom, right)
1050 op.set_ifm_ofm_shapes()
1051 return op
1052
1053
1054def convert_pad(op: Operation, arch, nng):
1055 """
1056 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1057 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1058 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1059 """
1060 if op.type != Op.Pad or not op.run_on_npu:
1061 return op
1062 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1063
1064 ifm = op.ifm
1065 assert ifm is not None
1066 ifm_shape = op.ifm_shapes[0]
1067 ofm = op.ofm
1068 assert ofm is not None
1069 ofm.ops = []
1070 ofm_shape = op.ofm_shapes[0]
1071
1072 # Average pool op that copies IFM to the right place inside the OFM
1073 shp0 = Shape4D(0, 0, 0, 0)
1074 shp_top = shp0.with_height(top)
1075 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1076 avgpool_op.activation = op.activation
1077 quant = ofm.quantization
1078 pad_value = quant.zero_point
1079 # Add operations that fill the borders of the OFM
1080 if top > 0:
1081 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1082 zero_tens = create_const_tensor(
1083 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1084 )
1085 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1086 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1087 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1088 if bottom > 0:
1089 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1090 zero_tens = create_const_tensor(
1091 op.name + "_bottom",
1092 shape.as_list(),
1093 ofm.dtype,
1094 shape.elements() * [pad_value],
1095 np.uint8,
1096 quantization=quant,
1097 )
1098 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1099 create_avg_pool_for_concat(
1100 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1101 )
1102 if left > 0:
1103 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1104 zero_tens = create_const_tensor(
1105 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1106 )
1107 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1108 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1109 if right > 0:
1110 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1111 zero_tens = create_const_tensor(
1112 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1113 )
1114 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1115 create_avg_pool_for_concat(
1116 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1117 )
1118
1119 op.type = Op.ConcatTFLite
1120 return avgpool_op
1121
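# After this fall-back conversion the OFM is produced by up to five ConcatSliceWrite
# avgpools: one that copies the IFM into the interior of the OFM, and up to four that
# write constant tensors filled with the OFM zero point into the top, bottom, left and
# right borders.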
1122
1123def add_attrs_to_resizebilinear(op, arch, nng):
1124 if op.type == Op.ResizeBilinear and op.run_on_npu:
1125 input_tensor = op.inputs[0]
1126 input_shape = op.ifm_shapes[0]
1127 upscaled_height = input_shape.height * 2
1128 upscaled_width = input_shape.width * 2
1129 out_shape = op.ofm_shapes[0]
1130 if not op.attrs["align_corners"] and out_shape.height == upscaled_height and out_shape.width == upscaled_width:
1131 # this means the output is supposed to be a x2 upscale,
1132 # so we need to do SAME padding
1133 op.attrs["padding"] = Padding.SAME
1134 elif (
1135 op.attrs["align_corners"]
1136 and out_shape.height == (upscaled_height - 1)
1137 and out_shape.width == (upscaled_width - 1)
1138 ):
1139 # here we can just run the avg pool without padding and
1140 # produce a (M * 2 - 1, N * 2 - 1) sized output
1141 op.attrs["padding"] = Padding.VALID
1142 else:
1143 return op
1144 input_tensor.resampling_mode = resampling_mode.NEAREST
1145 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
1146 return op
1147
1148
1149def fixup_bias_tensors(op, arch, nng):
1150 if op.type.needs_bias() and op.bias is None:
1151 # Op has no bias, add bias tensor filled with zeros
1152 nr_biases = op.inputs[1].shape[-1]
1153 bias_values = [0] * nr_biases
1154 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1155 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1156
1157 return op
1158
1159
1160def fixup_asymmetric_weights(op, arch, nng):
1161 if op.run_on_npu and (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op()):
1162 if op.ifm.dtype == DataType.int8:
1163 if not np.all(op.weights.quantization.zero_point == 0):
1164 print(f"Warning: {op.type} '{op.name}' has asymmetric weights, zero points have been adjusted.")
1165 op.weights.quantization.zero_point *= 0
1166
1167 return op
1168
1169
1170def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1171 if op.type == Op.Mean and op.run_on_npu:
1172 keep_dims = op.attrs.get("keep_dims", False)
1173 inp, axis = op.inputs
1174 shape = inp.shape
1175 ofm_shape = op.ofm.shape
1176 dims = len(shape)
1177 dims_ofm = len(ofm_shape)
1178
1179 # Height and width axes have different index depending on dimensions
1180 if axis.shape == [] or axis.shape[0] == 1: # single axis
1181 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1182 if dims in (2, 3):
1183 if axis == 0:
1184 h, w = shape[axis], 1
1185 else:
1186 h, w = 1, shape[axis]
1187 else:
1188 if axis == 1:
1189 h, w = shape[axis], 1
1190 else:
1191 h, w = 1, shape[axis]
1192 else: # multiple axes
1193 axis = sorted(axis.values)
1194 h, w = [shape[i] for i in axis]
1195
1196 # Set necessary depthwise attributes
1197 op.attrs.update(
1198 {
1199 "padding": Padding.VALID,
1200 "stride_h": 1,
1201 "stride_w": 1,
1202 "strides": (1, 1, 1, 1),
1203 "depth_multiplier": 1,
1204 "channel_multiplier": 1,
1205 "dilation_h_factor": 1,
1206 "dilation_w_factor": 1,
1207 "dilation": (1, 1, 1, 1),
1208 }
1209 )
1210 # Change op type
1211 op.type = Op.DepthwiseConv2DBias
1212 # Set IFM/OFM shapes after changing op type
1213 op.set_ifm_ofm_shapes()
1214
1215 weight_scale, bias = 1, None
1216 ofmq, ifmq = op.ofm.quantization, inp.quantization
1217 # Set rounding mode, scaling and zero point based on which reference implementation to match
1218 if len(shape) == 4 and axis == [1, 2] and keep_dims:
1219 if inp.dtype == DataType.uint8:
1220 # This attribute means a different scaling calculation is used in order to match reference
1221 op.low_precision_scaling = True
1222 weight_scale = h * w
1223 # Set zero points to 0 as they will be adjusted for with bias term
1224 foq = ofmq.clone()
1225 foq.zero_point = 0
1226 fiq = ifmq.clone()
1227 fiq.zero_point = 0
1228 op.forced_input_quantization = fiq
1229 bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
1230 # If the bias term is outside uint8 range, we need an Add op to apply it.
1231 if bias_term < 0 or bias_term > 255:
1232 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1233 # Bias term has higher bitness (i32) than input/output (u8).
1234 # 16 bits is enough since the bias is added/subtracted from a u8 value,
1235 # the bias can only effectively assume values in the range [-255, 255].
1236 intermediate.dtype = DataType.int16
1237 intermediate.quantization.zero_point = 0
1238 add_op = Operation(Op.Add, op.name + "_bias")
1239 add_op.forced_output_quantization = foq
1240 add_op.add_input_tensor(intermediate)
1241 quant = QuantizationParameters()
1242 quant.zero_point = 0
1243 bias_term_tens = create_const_tensor(
1244 op.name + "_bias", [1, 1, 1, 1], DataType.int16, [bias_term], np.int16, quantization=quant,
1245 )
1246 add_op.add_input_tensor(bias_term_tens)
1247 add_op.set_output_tensor(op.ofm)
1248 add_op.set_ifm_ofm_shapes()
1249 add_op.activation = op.activation
1250 op.activation = None
1251 op.set_output_tensor(intermediate)
1252 op.set_ifm_ofm_shapes()
1253 # If not, we can just do it with the OFM zero point.
1254 else:
1255 foq.zero_point = bias_term
1256 op.forced_output_quantization = foq
1257 else:
1258 assert inp.dtype == DataType.int8
1259 # Use a depthwise to calculate the sum,
1260 # followed by a multiplication with 1/N to get the MEAN
1261 weight_scale = 1
1262 intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
1263 intermediate.dtype = DataType.int16
1264 mul_op = Operation(Op.Mul, op.name + "_mul")
1265 mul_op.add_input_tensor(intermediate)
1266 # Create scalar containing 1/N
1267 quant = QuantizationParameters()
1268 quant.zero_point = 0
1269 # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
1270 # while rounding mode NATURAL would round this to -1.
1271 # This can only occur if N is even, and can be emulated by
1272 # multiplying with a number that is slightly smaller than 1/N.
1273 # It must be so small that other roundings are not affected;
1274 # the calculated value is based on worst case,
1275 # which is sum 256 * N (the maximum sum that can occur with int8)
1276 n = int(h * w)
1277 eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
1278 quant.scale_f32 = 1 / (n - eps)
1279 scalar = create_const_tensor(
1280 op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
1281 )
1282 mul_op.add_input_tensor(scalar)
1283 mul_op.set_output_tensor(op.ofm)
1284 mul_op.set_ifm_ofm_shapes()
1285 mul_op.rounding_mode = NpuRoundingMode.NATURAL
1286 mul_op.activation = op.activation
1287 op.activation = None
1288 op.set_output_tensor(intermediate)
1289 op.set_ifm_ofm_shapes()
1290 elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
1291 # Here we can just use a simple AvgPool with truncating rounding,
1292 # as we're emulating simple integer division.
1293 op.rounding_mode = NpuRoundingMode.TRUNCATE
1294 op.type = Op.AvgPool
1295 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1296 else:
1297 op.rounding_mode = NpuRoundingMode.NATURAL
1298 weight_scale = 1 / (h * w)
1299 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1300 bias = -ifmq.zero_point * h * w
1301 fiq = ifmq.clone()
1302 fiq.zero_point = 0
1303 op.forced_input_quantization = fiq
1304
1305 # Change dimensions to 4
1306 def extend_dims(dim, in_shape):
1307 if dim < 4:
1308 in_shape = [1] + in_shape
1309 if dim == 2:
1310 in_shape += [1]
1311 return in_shape
1312
1313 if dims < 4 or dims_ofm < 4:
1314 # Fix the ofm dimension when keep_dims is false
1315 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
1316 if isinstance(axis, int) and dims_ofm + 1 == dims:
1317 ofm_shape.insert(axis, 1)
1318 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
1319 for i in axis:
1320 ofm_shape.insert(i, 1)
1321 shape = extend_dims(dims, shape)
1322 dims_ofm = len(ofm_shape)
1323 ofm_shape = extend_dims(dims_ofm, ofm_shape)
1324 op.set_ifm_ofm_shapes()
1325
1326 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
1327 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
1328 shape = [shape[0], 1, h * w, shape[3]]
1329 op.ifm_shapes[0] = Shape4D(shape)
1330 if h > 256 and op.type == Op.AvgPool:
1331 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1332
1333 # If the AvgPool version is used, we don't need to do anything else
1334 if op.type == Op.AvgPool:
1335 return op
1336
1337 # Make unit weight tensor quantization
1338 weight_quant = ifmq.clone()
1339 weight_quant.min = 0
1340 weight_quant.max = 255
1341 weight_quant.scale_f32 = weight_scale
1342 weight_quant.zero_point = 0
1343
1344 # Set weight shape to [H,W,C,B]
1345 weight_shape = [h, w, shape[3], shape[0]]
1346
1347 # Add unit weight tensor
1348 op.set_input_tensor(
1349 create_const_tensor(
1350 "weights",
1351 weight_shape,
1352 inp.dtype,
1353 np.ones(weight_shape),
1354 value_dtype=np.uint8,
1355 quantization=weight_quant,
1356 ),
1357 1,
1358 )
1359 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
1360
1361 # Add None bias tensor
1362 op.inputs.append(None)
1363 # Add bias tensor
1364 if bias:
1365 bias_shape = [shape[-1]]
1366 op.set_input_tensor(
1367 create_const_tensor(
1368 "bias", bias_shape, inp.dtype, np.ones(bias_shape) * bias, value_dtype=np.int32, quantization=None,
1369 ),
1370 2,
1371 )
1372
1373 return op
1374
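# Summary of the conversion above (a rough guide, the exact path depends on the reduced
# axes, dtype and quantisation): a Mean over the H/W axes with keep_dims becomes a
# DepthwiseConv2DBias with an all-ones kernel, using low-precision scaling plus an
# optional bias Add for uint8, or a depthwise sum followed by a Mul with a scalar close
# to 1/N for int8; when IFM and OFM quantisation match it is lowered to a plain AvgPool
# with truncating rounding instead.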
1375
1376def supported_operator_check(op, arch, nng):
1377 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
1378 return op
1379
1380
1381def tflite_optimise_graph(nng, arch):
1382 # Pre-processing step
1383 pre_process_list = [
1384 supported_operator_check,
1385 set_ifm_ofm_op_shapes,
1386 ]
1387
1388 for idx, sg in enumerate(nng.subgraphs):
1389 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1390 nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
1391 )
1392
1393 # Handle Concat Ops
1394 for idx, sg in enumerate(nng.subgraphs):
1395 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1396 sg.refresh_after_modification()
1397
1398 # Handle Split Ops
1399 for idx, sg in enumerate(nng.subgraphs):
1400 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1401 nng,
1402 sg,
1403 arch,
1404 [],
1405 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1406 rewrite_unsupported=False,
1407 )
1408
1409 for idx, sg in enumerate(nng.subgraphs):
1410 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1411 nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
1412 )
1413
1414 # Handle sg input output
1415 for idx, sg in enumerate(nng.subgraphs):
1416 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1417 nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
1418 )
1419
1420 # Removal of memory only operators
1421 for sg in nng.subgraphs:
1422 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_only_ops])
1423 sg.refresh_after_modification()
1424
1425 # Rewrite of operators
1426 op_rewrite_list = [
1427 set_tensor_equivalence,
1428 convert_mean_to_depthwise_conv_or_avgpool,
1429 convert_depthwise_to_conv,
1430 convert_conv_to_fc,
1431 convert_softmax,
1432 optimise_strided_conv,
1433 convert_hardswish_to_lut,
1434 rewrite_fully_connected_input,
1435 convert_batched_fc_shape,
1436 fixup_conv2d_backprop,
1437 fixup_relus_with_differing_ifm_ofm_scaling,
1438 reorder_depthwise_weights,
1439 fixup_resizebilinear,
1440 fixup_bias_tensors,
1441 fixup_asymmetric_weights,
1442 convert_mul_max_to_abs_or_lrelu,
1443 convert_lrelu,
1444 convert_tanh_sigmoid_to_lut,
1445 replace_pad_by_hw_pad,
1446 ]
1447
1448 for idx, sg in enumerate(nng.subgraphs):
1449 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1450 nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
1451 )
1452
1453 for idx, sg in enumerate(nng.subgraphs):
1454 # remove passthrough tensors and attempt further optimizations
1455 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1456 nng,
1457 sg,
1458 arch,
1459 [remove_passthrough_tensor],
1460 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
1461 )
1462
1463 # Removal of SplitSliceRead, need to be done after optimisation has been performed,
1464 # since ifm/ofm_shapes are of importance to this function
1465 for sg in nng.subgraphs:
1466 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
1467 sg.refresh_after_modification()
1468
1469 return nng