blob: ef6b90b5ce6aa62ecf481d76e6578b8943ff5e0a [file] [log] [blame]
Tim Hall3b1578e2023-01-13 17:57:25 +00001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020017# Description:
18# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
19# to do the traversal of the graph.
Raul Farkas10d6b3b2023-01-30 12:58:46 +000020from __future__ import annotations
21
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020022import math
23import uuid
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020024
25import numpy as np
26
27from . import fp_math
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020028from . import rewrite_graph
29from . import scaling
Fredrik Svedberga04f2f72022-07-06 13:42:24 +020030from .data_type import BaseType
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020031from .data_type import DataType
32from .debug_database import DebugDatabase
33from .errors import UnsupportedFeatureError
34from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +020035from .graph_optimiser_util import bypass_memory_only_ops
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020036from .graph_optimiser_util import calc_explicit_padding
Patrik Gustavssondf995102021-08-23 15:33:59 +020037from .graph_optimiser_util import convert_depthwise_to_conv
Fredrik Svedberg0ac08042023-04-11 22:35:04 +020038from .graph_optimiser_util import create_avg_pool_for_concat
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +020039from .graph_optimiser_util import memory_only_ops
Patrik Gustavssonf1580f02021-09-01 12:43:02 +020040from .graph_optimiser_util import move_splitsliceread_to_consumer
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020041from .graph_optimiser_util import needed_total_padding
42from .graph_optimiser_util import set_ifm_ofm_op_shapes
43from .graph_optimiser_util import set_tensor_equivalence
Fredrik Svedberg0ac08042023-04-11 22:35:04 +020044from .lstm import Lstm
Johan Alfvence502732023-04-24 13:35:40 +020045from .lut import convert_to_lut
46from .lut import create_lut_8bit_op
47from .lut import create_lut_int16_op
Johan Alfven8e525ca2023-05-07 13:12:37 +020048from .lut import create_lut_rsqrt_int8_op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020049from .numeric_util import clamp_sigmoid
Johan Alfven56811e62023-03-27 11:33:50 +020050from .numeric_util import full_shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020051from .numeric_util import round_away_zero
Johan Alfven7b3008a2023-04-13 18:54:47 +020052from .numeric_util import round_down_log2
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020053from .operation import create_activation_function
Fredrik Svedberg1a7527c2021-09-13 15:52:16 +020054from .operation import ExplicitScaling
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020055from .operation import NpuBlockType
56from .operation import Op
57from .operation import Operation
58from .operation import Padding
Tim Hall5ff4cd12023-05-16 22:39:14 +010059from .operation import RoundingMode
Alexander Hansson90c34b52023-05-31 15:03:03 +000060from .operation_util import create_add
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010061from .operation_util import create_add_nop
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020062from .operation_util import create_avgpool_nop
Johan Alfvenc1ad80b2023-03-31 10:19:23 +020063from .operation_util import create_cast_op
Rickard Bolin6986a072022-12-19 12:33:40 +000064from .operation_util import create_depthwise_maxpool
Johan Alfvenc1ad80b2023-03-31 10:19:23 +020065from .operation_util import create_memcpy
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020066from .operation_util import get_pad_values_from_input
Ayaan Masood25f48dd2022-06-29 18:16:04 +010067from .scaling import quantise_scale
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020068from .shape4d import Shape4D
69from .softmax import SoftMax
70from .tensor import check_quantized_tens_scaling_equal
71from .tensor import create_const_tensor
72from .tensor import create_equivalence_id
73from .tensor import QuantizationParameters
74from .tensor import Tensor
75from .tensor import TensorPurpose
76from .tflite_mapping import optype_to_builtintype
Raul Farkas3b64f062023-05-16 17:18:31 +010077from .utils import calc_resize_factor
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020078
79passthrough_nodes = (Op.Identity,)
80
81
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020082def remove_passthrough_tensor(tens, arch, nng):
83 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
84 assert len(tens.ops[0].inputs) == 1
85 tens = tens.ops[0].inputs[0]
86 return tens
87
88
89def rewrite_concat_ops(op, arch):
90 if not op.run_on_npu or not op.type.is_concat_op():
91 return
92
93 axis_4D = 0
94 ofm = op.ofm
95 ofm.ops = []
96 offset = 0
97
98 unfuse_activation_function(op)
99
100 if op.type == Op.Pack:
101 # Pack is also referred to as Stack
102 axis = int(op.attrs["axis"])
103 if axis < 0: # Convert to positive axis
104 axis = len(op.inputs[0].shape) + 1 + axis
105
106 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
107
108 axis_4D = axis + (4 - len(desired_shape))
109
110 for idx, inp in enumerate(op.inputs):
111 op.ifm_shapes[idx] = Shape4D(desired_shape)
112 op.type = Op.PackReshaped
113
114 inputs, axis = op.get_concat_inputs_axis()
115 for idx, inp in enumerate(inputs):
116 if op.type != Op.PackReshaped:
117 op.ifm_shapes[idx] = Shape4D(inp.shape)
118 if axis >= 0:
119 axis_4D = axis + (4 - len(inp.shape))
120 else:
121 axis_4D = axis
122 write_offset = [0, 0, 0, 0]
123 write_offset[axis_4D] = offset
124 concat_end = offset + op.ifm_shapes[idx][axis_4D]
125 create_avg_pool_for_concat(
126 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
127 )
128 offset = concat_end
129 assert ofm.shape[axis] == offset
130
131 return op
132
133
134def rewrite_split_ops(tens, arch, nng):
135
136 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
137 split_op = tens.ops[0]
138
139 # Not supported so leave it and run on CPU
140 if not split_op.run_on_npu:
141 return tens
142
143 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
144
145 tens.ops = []
146 new_op = Operation(Op.SplitSliceRead, split_op.name)
147 new_op.inputs = [inp]
148 ofm_shape_idx = 0
Tim Hall51a8dce2021-12-20 16:49:27 +0000149 if None in (offset_end, offset_start):
150 read_shape = None
151 else:
152 # the read shape is relative to each start offset
William Isakssona71efe02023-07-12 12:28:05 +0000153 read_shape = Shape4D([oe - os for oe, os in zip(offset_end, offset_start)])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200154
155 # For Split the offset cannot be extracted from the tensor so it has to
156 # be calculated from the index of the output tensor
157 if axis is not None:
158 # Get the start and end of the split
159 offset_start = [0] * 4
160 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
161 for idx, out in enumerate(outputs):
162 if axis_4D_list is not None:
163 axis_4D = axis_4D_list[idx]
164 else:
165 split_op.ofm_shapes[idx] = Shape4D(out.shape)
166 if axis >= 0:
167 axis_4D = axis + (4 - len(out.shape))
168 else:
169 axis_4D = axis
170
171 if out == tens:
172 ofm_shape_idx = idx
173 read_shape = split_op.ofm_shapes[idx]
174 break
175
176 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
177
178 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
179 new_op.read_shapes[0] = read_shape
180 new_op.run_on_npu = True
181 new_op.set_output_tensor(tens)
182 new_op.ifm_shapes.append(Shape4D(inp.shape))
183 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
184 DebugDatabase.add_optimised(split_op, new_op)
185
186 return tens
187
188
189def remove_SplitSliceRead(op, arch):
190
191 if op.type == Op.SplitSliceRead:
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200192 # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
193 # or if an avgpool need to be inserted
194 if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
195 consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
196 for consumer in op.ofm.consumer_list
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200197 ):
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200198 # SplitSliceRead can be performed by tensor consumer(s)
199 for cons_op in list(op.ofm.consumer_list):
200 move_splitsliceread_to_consumer(op, cons_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200201 else:
202 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
203 avgpool_op.add_input_tensor(op.ifm)
204 avgpool_op.outputs = [op.ofm]
205 op.ofm.ops.remove(op)
206 op.ofm.ops.append(avgpool_op)
207 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
208 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
209 avgpool_op.read_offsets[0] = op.read_offsets[0]
210 avgpool_op.read_shapes[0] = op.read_shapes[0]
211
212 op.ifm.consumer_list.remove(op)
213 DebugDatabase.add_optimised(op, avgpool_op)
214
215
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200216def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
217 k_w, k_h = kernel.dilated_wh()
218 s_x, s_y = kernel.stride
219 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
220 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
221 if padding_type == Padding.SAME:
222 left_pad = (xpad + 0) // 2
223 right_pad = (xpad + 1) // 2
224 top_pad = (ypad + 0) // 2
225 bottom_pad = (ypad + 1) // 2
226 elif padding_type == Padding.VALID:
227 left_pad = 0
228 right_pad = 0
229 top_pad = 0
230 bottom_pad = 0
231 elif padding_type == Padding.EXPLICIT:
232 # Padding is specified in a PAD operator which has been bypassed.
233 top, left, bottom, right = explicit_padding
234 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
235 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
Rickard Bolin9ae34552022-06-09 13:07:17 +0000236 elif padding_type == Padding.TILE:
237 # The values in the explicit padding only represent the "direction" in which to pad
238 top_pad, left_pad, bottom_pad, right_pad = explicit_padding
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200239 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000240 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200241 padding = (top_pad, left_pad, bottom_pad, right_pad)
242 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
243 return padding, skirt
244
245
246def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
247 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
248 if padding_type == Padding.SAME:
249 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
250 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
251 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
252 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
253 left_pad = max(kernel_width - 1 - right_pad, 0)
254 top_pad = max(kernel_height - 1 - bottom_pad, 0)
255 elif padding_type == Padding.VALID:
256 right_pad = max(kernel_width - 2, 0)
257 bottom_pad = max(kernel_height - 2, 0)
258 left_pad = kernel_width - 1
259 top_pad = kernel_height - 1
260 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000261 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200262 padding = (top_pad, left_pad, bottom_pad, right_pad)
263 skirt = padding
264 return padding, skirt
265
266
Raul Farkas66207142023-05-25 11:15:20 +0100267def fixup_conv2d_backprop(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200268 if op.type == Op.Conv2DBackpropInput:
269 # flip the inputs
270 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
271 op.type = Op.Conv2DBackpropInputSwitchedBias
Tim Hall3c5cfe92022-03-16 16:31:57 +0000272 op.ifm_resampling_mode = resampling_mode.TRANSPOSE
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200273
274 # Update strides
275 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
wilisa0179a89042022-11-02 17:18:43 +0000276 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200277
278 return op
279
280
281# Convert the op to an elementwise add
Tim Hall885033b2022-07-21 11:46:03 +0100282def convert_resize_1x1_to_add(op):
283 op.type = Op.Add # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200284 op.name = op.name + "_add"
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200285 # Create an input tensor filled with zeros
wilisa018289d512023-01-12 08:17:23 +0000286 name = op.inputs[1].name + "_add"
287 dtype = op.inputs[0].dtype
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200288 shape = op.ofm_shapes[0].as_list()
wilisa018289d512023-01-12 08:17:23 +0000289 values = np.zeros(shape, dtype.as_numpy_type())
290 quantization = QuantizationParameters(0.0, 255.0)
291 quantization.scale_f32 = 1.0
292 quantization.zero_point = 0
wilisa0116b5e5e2023-02-14 12:03:59 +0000293 op.inputs[1] = op.inputs[0]
294 op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200295 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000296 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200297
298 return op
299
300
Tim Hall5ff4cd12023-05-16 22:39:14 +0100301# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
Tim Hall885033b2022-07-21 11:46:03 +0100302# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
303# to select the appropriate nearest neighbor value
304def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
305 ifm = op.ifm
306 ofm = op.ofm
307 output_depth = ofm.shape[-1]
308 dw_op_attrs = {
309 "padding": Padding.VALID,
310 "stride_h": 1,
311 "stride_w": 1,
312 "strides": (1, 1, 1, 1),
313 "depth_multiplier": 1,
314 "channel_multiplier": 1,
315 "dilation_h_factor": 1,
316 "dilation_w_factor": 1,
317 "dilation": (1, 1, 1, 1),
318 }
319
Tim Hall5ff4cd12023-05-16 22:39:14 +0100320 # change ResizeNearestNeighbor to Depthwise
Tim Hall885033b2022-07-21 11:46:03 +0100321 op.type = Op.DepthwiseConv2DBias
322 op.attrs.update(dw_op_attrs)
323 op.set_input_tensor(ifm, 0) # ifm tensor index
324 op.activation = None
325
326 # add input resample to resize by x2
327 op.ifm_resampling_mode = resampling_mode.NEAREST
328
329 # don't care about the rounding mode as it is nearest neighbor
330
331 # setup weight tensor
332 weight_quant = QuantizationParameters()
333 weight_quant.scale_f32 = 1.0 # no scaling as only a single non-zero coeff to select the desired value
334 weight_quant.zero_point = 0
335 weight_quant.quant_dim = 0
336 ofm_dtype = ofm.dtype
Tim Hall3b1578e2023-01-13 17:57:25 +0000337 if ofm_dtype.type == BaseType.UnsignedInt:
Tim Hall885033b2022-07-21 11:46:03 +0100338 weight_quant.quant_min = 0
339 weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
340 else:
Tim Hall885033b2022-07-21 11:46:03 +0100341 weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
342 weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1
343
344 weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth] # HWIO
345
346 # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
347 # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
348 # below-and-right (i.e. next) to it (D).
349 # 0---1---2
350 # | A | B |
351 # 1---*---+
352 # | C | D |
353 # 2---+---+
354 weight_values = [0] * (upscale_factor * upscale_factor)
355 centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
356 weight_values[centre_coeff] = 1
357
358 # add weight tensor, this will discard the size tensor of the resize op
359 op.set_input_tensor(
360 create_const_tensor(
361 "weights",
362 weight_shape,
Tim Hall3b1578e2023-01-13 17:57:25 +0000363 ofm_dtype,
Tim Hall885033b2022-07-21 11:46:03 +0100364 np.array(weight_values).reshape(weight_shape),
Tim Hall885033b2022-07-21 11:46:03 +0100365 quantization=weight_quant,
366 ),
367 1, # inputs tensor weight index
368 )
369
370 # setup bias tensor by assign None and then call the fix-up function to create a suitable tensor.
371 # need to append the bias tensor as resize ops only have 2 inputs
372 assert len(op.inputs) == 2
373 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +0200374 fixup_bias_tensors(op, None, None, DataType.int32)
Tim Hall885033b2022-07-21 11:46:03 +0100375
376 # finally update the shape incase we've change the tensor shapes or connections
377 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000378 DebugDatabase.add_optimised(op, op)
Tim Hall885033b2022-07-21 11:46:03 +0100379
380 return op
381
382
383# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
384# final average pool with a kernel size that depends upon the resize ops upscaling factor (x2, x4 or x8). The maximum
385# upscale factor is limited to x8 because of the limit 8x8 kernel size limit for average pool with padding.
386def convert_resize_to_upscale_and_average_pool(op):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200387 pre_op = op
388 outputs = op.outputs
Rickard Boline546def2022-01-25 15:45:00 +0000389 dtype = op.ifm.dtype
Tim Hall885033b2022-07-21 11:46:03 +0100390
Rickard Boline546def2022-01-25 15:45:00 +0000391 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
Tim Hall47c76362022-07-18 21:26:47 +0100392 op.attrs["padding"] = Padding.SAME # doesn't really matter as the kernel is 1x1
Tim Hall3c5cfe92022-03-16 16:31:57 +0000393 op.ifm_resampling_mode = resampling_mode.NEAREST
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200394
395 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
Tim Hall47c76362022-07-18 21:26:47 +0100396
397 # Get upscale factor that was calculated in the supported operators check
398 upscale_factor = op.attrs["upscale_factor"]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200399
Rickard Boline546def2022-01-25 15:45:00 +0000400 # Calculate how many times 2x2 upscaling needs to be performed
Tim Hallf9267da2022-04-20 20:19:48 +0100401 # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
402 # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
Rickard Boline546def2022-01-25 15:45:00 +0000403 n = int(np.log2(upscale_factor))
404
Tim Hall885033b2022-07-21 11:46:03 +0100405 # Perform x2 upscaling n-1 times
Rickard Boline546def2022-01-25 15:45:00 +0000406 scaled_op = pre_op
407 for count in range(n - 1):
408 if count > 0:
409 scaled_op = op.clone(f"_{count}")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200410 scaled_op.inputs[0] = pre_op.outputs[0]
411
Tim Hall885033b2022-07-21 11:46:03 +0100412 # Nearest neighbor x2 upscaling
Tim Hall47c76362022-07-18 21:26:47 +0100413 upscaled_shape = upscaled_shape * 2
Rickard Boline546def2022-01-25 15:45:00 +0000414 shape = op.ofm_shapes[0].as_list()
415 shape[1:3] = upscaled_shape
416 out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
417 out_tens.quantization = op.outputs[0].quantization.clone()
418 scaled_op.set_output_tensor(out_tens)
419 pre_op = scaled_op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200420
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200421 scaled_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000422 DebugDatabase.add_optimised(op, scaled_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200423
Tim Hall885033b2022-07-21 11:46:03 +0100424 # Last x2 upscaling
Rickard Boline546def2022-01-25 15:45:00 +0000425 if n > 1:
426 scaled_op = op.clone(f"_{n-1}")
427 scaled_op.inputs[0] = pre_op.outputs[0]
Tim Hall885033b2022-07-21 11:46:03 +0100428
429 if scaled_op.original_type == Op.ResizeBilinear:
430 if scaled_op.attrs["align_corners"]:
431 # no padding
432 scaled_op.attrs["padding"] = Padding.VALID
433 else:
434 # padding to the right and bottom (limits average pool to 8x8 kernel)
435 scaled_op.attrs["padding"] = Padding.EXPLICIT
436 scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]
437
438 # kernal size dependent on the upscaling factor
439 scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
440 else: # Op.ResizeNearestNeighbor
441 if scaled_op.attrs["align_corners"]:
442 # use depthwise conv to select the correct value
443 scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
444 else:
Johan Alfvéna64616c2022-10-17 12:29:12 +0200445 # Keep 1x1 kernel and average pool, this applies both when
446 # half-pixel-centers is True and False. Calculations are the
447 # same in the reference.
Tim Hall885033b2022-07-21 11:46:03 +0100448 pass
449
Rickard Boline546def2022-01-25 15:45:00 +0000450 scaled_op.outputs = outputs
451 scaled_op.outputs[0].ops = [scaled_op]
452 scaled_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000453 DebugDatabase.add_optimised(op, scaled_op)
Rickard Boline546def2022-01-25 15:45:00 +0000454
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200455 return op
456
457
Raul Farkas66207142023-05-25 11:15:20 +0100458def convert_argmax_to_depthwise_conv_and_max_pool(op: Operation, arch, nng) -> Operation:
Rickard Bolin6986a072022-12-19 12:33:40 +0000459 """
460 Convert ArgMax to DWConv2D->MaxPool->DWConv2D, see details below.
461
462 Example:
463 arr = [4, [00000100,
464 6, = 00000110, # <-- This is the largest value, so we're expecting argmax(arr) = 1
465 5] 00000101]
466
467 Use 16-bit precision and shift all values 7 bits to the left:
468 Shifted_arr = [0000001000000000,
469 0000001100000000,
470 0000001010000000]
471
472 Add "c - index of channel" to each channel:
473 Shifted_arr_plus_reverse_idx = [0000001000000010, (+2)
474 0000001100000001, (+1)
475 0000001010000000] (+0)
476
477 The index is reversed since ArgMax selects the lowest index if maximum value is found at two index. The index will
478 act as a tie-breaker between channels with equal values and since we want the smallest channel index to be chosen
479 we reverse the index before the maxpool and then subtract the index from the number of channel after the maxpool to
480 get the correct index.
481
482 Find the maximum value in the array:
483 val = max(shifted_arr_plus_reverse_idx) = 0000001100000001
484
485 Subtract the value from the number of channels:
486 shifted_arr_plus_idx = (c-1) - val = 2 - 1 = 1
487
488 Extract the 7 lowest bits using a LUT to cut off the 9 most significant bits:
489 idx = LUT(val) = 0000000000000001 = 1
490 """
491
492 if op.type == Op.ArgMax:
493 ifm, ofm = op.inputs[0], op.outputs[0]
494 identity_quant = QuantizationParameters()
495 identity_quant.zero_point = 0
496 identity_quant.scale_f32 = 1.0
Rickard Bolin6986a072022-12-19 12:33:40 +0000497 # Add last dimension to ofm shape
498 ofm.shape += [1]
499 ofm.ops = []
500
501 # Create 1x1 Depthwise convolution with 2**7 weights for each channel to convert precision to 16 bit and shift
502 # all values 7 bits to the left
503 # Set necessary depthwise attributes
504 dw_op_attrs = {
505 "padding": Padding.VALID,
506 "stride_h": 1,
507 "stride_w": 1,
508 "strides": (1, 1, 1, 1),
509 "depth_multiplier": 1,
510 "channel_multiplier": 1,
511 "dilation_h_factor": 1,
512 "dilation_w_factor": 1,
513 "dilation": (1, 1, 1, 1),
514 "explicit_padding": None,
515 }
Johan Alfvenc1ad80b2023-03-31 10:19:23 +0200516 orig_name = op.name
517 op.name = f"{orig_name}_depthwise_conv_SHL_7"
Rickard Bolin6986a072022-12-19 12:33:40 +0000518 op.type = Op.DepthwiseConv2DBias
519 op.attrs.update(dw_op_attrs)
Johan Alfven56811e62023-03-27 11:33:50 +0200520 n, h, w, c = full_shape(4, ifm.shape, 1)
Rickard Bolin6986a072022-12-19 12:33:40 +0000521 shape = [1, 1, 1, c]
522 kernel = np.dstack([2**7] * c)
523 op.inputs = []
524 op.add_input_tensor(ifm)
525 op.add_input_tensor(
526 create_const_tensor(
527 "weights",
528 shape,
529 DataType.uint8,
530 np.array(kernel).reshape(shape),
531 quantization=identity_quant,
532 ),
533 )
534 # Let the bias for each channel be the "reverse" index of the channel it is in, ie c - channel_idx
535 reverse_idxs = list(reversed(range(c)))
536 bias_tensor = create_const_tensor(op.name + "_bias", [c], DataType.int64, reverse_idxs)
537 op.add_input_tensor(bias_tensor)
538
539 intermediate_tens = Tensor([n, h, w, c], DataType.int16, "int16_and_shifted_7_bits_left")
540 intermediate_tens.quantization = ifm.quantization
541 op.set_output_tensor(intermediate_tens)
542 op.set_ifm_ofm_shapes()
543 orig_ifm_shape = op.ifm_shapes[0]
544 DebugDatabase.add_optimised(op, op)
545
546 # To extract 7 least significant bits and swap reverse index back to real index using a LUT activation, we set
547 # the base value to c-1 and slope to -128. The 16-bit LUT uses a table of 32-bit values where the top 16 bits
548 # represent the slope and bottom 16 bits the base which are used to interpolate the activation value.
549 slope = (-128 & 0xFFFF) << 16 # Top 16 bits of 32 bit LUT table value
550 base = c - 1 # Bottom 16 bits of the LUT table value
551 lut_tensor = create_const_tensor(
552 "maxpool_LUT_extract_7_LSB",
553 [1, 1, 1, 512],
554 DataType.uint32,
555 [slope + base] * 512,
556 TensorPurpose.LUT,
557 )
558
559 # Split large feature maps into smaller chunks since the Depthwise Maxpool height dimension can overflow due to
560 # flattening the ifm to (H*W)xCx1
561 max_height = 2**16 // orig_ifm_shape.width
562 num_full_height_ops = orig_ifm_shape.height // max_height
563 last_op_height = orig_ifm_shape.height - max_height * num_full_height_ops
564 op_heights = [max_height] * num_full_height_ops
565 if last_op_height > 0:
566 op_heights.append(last_op_height)
567
568 # Create maxpool output tensor which is reshaped to 1x(H*W)x1x1. The product H*W might be larger than the
569 # maximum allowed height, but that's handled by reading and writing the data in chunks
570 maxpool_ofm = Tensor([1, orig_ifm_shape.height * orig_ifm_shape.width, 1, 1], DataType.int16, "argmax_maxpool")
571 maxpool_ofm.quantization = identity_quant
572
573 for op_idx, op_height in enumerate(op_heights):
574 maxpool_op = create_depthwise_maxpool(
575 f"dw_maxpool_{op_idx}", intermediate_tens, orig_ifm_shape, identity_quant
576 )
577 maxpool_op.outputs = [maxpool_ofm]
578 maxpool_ofm.ops.append(maxpool_op)
579 maxpool_op.ofm_shapes = [Shape4D(maxpool_ofm.shape)]
580 maxpool_op.set_activation_lut(lut_tensor)
581
582 # Set read and write shapes/offsets to read/write chunks of the IFM/OFM
583 maxpool_op.read_shapes[0] = Shape4D([1, op_height * orig_ifm_shape.width, orig_ifm_shape.depth, 1])
584 maxpool_op.read_offsets[0] = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
585 maxpool_op.write_shape = Shape4D([1, op_height * orig_ifm_shape.width, 1, 1])
586 maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
587 DebugDatabase.add_optimised(op, maxpool_op)
588
Johan Alfvenc1ad80b2023-03-31 10:19:23 +0200589 # Set final shape
590 maxpool_ofm.set_all_shapes([1, h, w, 1])
591
592 # Convert 16bit to 32bit or 64bit
593 if ofm.dtype == DataType.int64:
594 # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
595 #
596 # A -> B -> C -> D (OFM)
597 # |0001| |00010000| |0001|0000| |00010000|00000000|
598 # i16 i32 i16 i16 i32 i32
599 # <-------i64------->
600 #
601 # Memcpy is used to copy the content from B to C and from D to OFM
602 # Memcpy will be turned into a nop or an DMA transer if memory regions differs.
603 intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
604 else:
605 intermediate_32bit = ofm
606
607 op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
608 DebugDatabase.add_optimised(op, op_cast)
609
610 if ofm.dtype == DataType.int64:
611 # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
612 intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
613 memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
614 DebugDatabase.add_optimised(op, memcpy_op)
615
616 # Create int32 tensor with double ofm shape to be able to store a "int64" result
617 intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")
618
619 op_cast = create_cast_op(
620 f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
621 )
622 DebugDatabase.add_optimised(op, op_cast)
623
624 memcpy_op = create_memcpy("f{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
625 DebugDatabase.add_optimised(op, memcpy_op)
Rickard Bolin6986a072022-12-19 12:33:40 +0000626
627 return op
628
629
Rickard Bolinfea15162022-07-04 16:19:16 +0000630def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
631 def _compute_interpolation_values(index, input_size, output_size):
632 scale = input_size / output_size
633 scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
634 lower_bound = max(np.floor(scaled_value), 0)
635
636 return scaled_value, lower_bound
637
638 def _compute_kernels(input_height, input_width, output_height, output_width):
639 kernels = []
640 for y in (1, 2):
641 for x in (1, 2):
642 sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
643 sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)
644
645 # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
646 # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
647 # top-to-bottom - same as the depthwise convolution strides across each tile
648 kernel = np.zeros((2, 2))
649 kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
650 kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
651 kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
652 kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
653 kernel *= 16
654 kernels.append(kernel)
655
656 return kernels
657
658 def _build_convolutions(op, kernels):
659 dw_op_attrs = {
660 "padding": Padding.TILE,
661 "stride_h": 1,
662 "stride_w": 1,
663 "strides": (1, 1, 1, 1),
664 "depth_multiplier": 1,
665 "channel_multiplier": 1,
666 "dilation_h_factor": 1,
667 "dilation_w_factor": 1,
668 "dilation": (1, 1, 1, 1),
669 }
670 ifm = op.ifm
671 ofm = op.ofm
672 ofm.ops = []
673 elem_size = 2 if ofm.dtype == DataType.int16 else 1
674
675 n, h, w, c = ifm.shape
676 _, _, ow, _ = ofm.shape
677
678 intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
679 intermediate_tens.quantization = op.outputs[0].quantization.clone()
680 avgpool_op = op
681 avgpool_op.name = "rb_init_avgpool"
682 avgpool_op.type = Op.AvgPool
683 avgpool_op.attrs["padding"] = Padding.VALID
684 avgpool_op.attrs["stride_w"] = 1
685 avgpool_op.attrs["stride_h"] = 1
686 avgpool_op.attrs["filter_width"] = 1
687 avgpool_op.attrs["filter_height"] = 1
688 avgpool_op.attrs["strides"] = [1, 1, 1, 1]
689 avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
690
691 avgpool_op.add_input_tensor(ifm)
692 avgpool_op.set_output_tensor(intermediate_tens)
693 avgpool_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000694 DebugDatabase.add_optimised(op, op)
Rickard Bolinfea15162022-07-04 16:19:16 +0000695
696 dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
697 dw_conv._original_type = Op.ResizeBilinear
698 dw_conv.write_shape = Shape4D(n, h, w, c)
699 dw_conv.write_offset = Shape4D(0, 0, 0, 0)
700
Tim Hall5ff4cd12023-05-16 22:39:14 +0100701 # Resize bilinear requires rounding away from zero
702 dw_conv.rounding_mode = RoundingMode.AwayZero
Rickard Bolinfea15162022-07-04 16:19:16 +0000703
704 # Double height and width stride to write the output of each of the four depthwise convolutions below
705 # interleaved with each other when combined with OFM tile base offsets.
706 dw_conv.ofm_stride_multiplier = [1, 2, 2] # C/H/W
707
708 # Choose tile padding direction - pad by 1 with edge values in two direction.
709 # For example, TL (top left) will pad top and left in H/W-plane in all channels.
710 directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]] # TL, TR, BL, BR
711 for i in (0, 1):
712 for j in (0, 1):
713 index = i * 2 + j
714 dw_conv.name = f"depthwise_conv_{index}"
715 dw_op_attrs["explicit_padding"] = directions[index]
716 dw_conv.attrs.update(dw_op_attrs)
717
718 # This will offset the start of the write by modifying the Tile 0 base address
719 dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size
720
721 ofm.ops.append(dw_conv)
722 dw_conv.outputs = [ofm]
723
724 kernel = kernels[index]
725 shape = [2, 2, 1, c]
726 kernel = np.dstack([kernel] * c)
727
728 quant = QuantizationParameters()
729 quant.zero_point = 0
730 quant.scale_f32 = 1.0 / 16
731
732 dw_conv.inputs = []
733 dw_conv.add_input_tensor(intermediate_tens)
734 dw_conv.add_input_tensor(
735 create_const_tensor(
736 "weights",
737 shape,
738 intermediate_tens.dtype,
739 np.array(kernel).reshape(shape),
Rickard Bolinfea15162022-07-04 16:19:16 +0000740 quantization=quant,
741 ),
742 )
743
744 # setup bias tensor by assign None and then call the fix-up function to create a suitable tensor.
745 # need to append the bias tensor as resize ops only have 2 inputs
746 assert len(dw_conv.inputs) == 2
747 dw_conv.inputs.append(None)
Rickard Bolin017b4cc2022-09-23 10:16:48 +0000748 fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)
Rickard Bolinfea15162022-07-04 16:19:16 +0000749
750 dw_conv.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000751 DebugDatabase.add_optimised(op, dw_conv)
752
Rickard Bolinfea15162022-07-04 16:19:16 +0000753 dw_conv = dw_conv.clone(f"_{index}")
754 return op
755
756 _, input_height, input_width, _ = op.ifm.shape
757 _, output_height, output_width, _ = op.ofm.shape
758
759 kernels = _compute_kernels(input_height, input_width, output_height, output_width)
760 op = _build_convolutions(op, kernels)
761
762 return op
763
764
Raul Farkas66207142023-05-25 11:15:20 +0100765def fixup_resize(op: Operation, arch, nng) -> Operation:
766 """Fixup resize ops to increase support for ResizeNearestNeighbor cases."""
Tim Hall885033b2022-07-21 11:46:03 +0100767 if op.type.is_resize_op() and op.run_on_npu:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200768 if op.ifm_shapes[0] == op.ofm_shapes[0]:
Tim Hall885033b2022-07-21 11:46:03 +0100769 # Bypass the resize op which is essentially a NOP
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200770 op.inputs = op.inputs[:1]
771 op.type = Op.Identity
772 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
Tim Hall885033b2022-07-21 11:46:03 +0100773 convert_resize_1x1_to_add(op)
Rickard Bolinfea15162022-07-04 16:19:16 +0000774 elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
775 convert_resizebilinear_to_depthwise_convolutions(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200776 else:
Tim Hall885033b2022-07-21 11:46:03 +0100777 convert_resize_to_upscale_and_average_pool(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200778
779 return op
780
781
782def convert_nop_split_to_identity(op, arch, nng):
783 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
784 # the list comprehension should return a list with a single tensor
785 # if it shouldn't, remove_passthrough_tensor will fail appropriately
786 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
787 op.type = Op.Identity
788 return op
789
790
Raul Farkas66207142023-05-25 11:15:20 +0100791def rewrite_fully_connected_input(op: Operation, arch, nng) -> Operation:
792 """Rewrite FullyConnected shape as 2D to allow it to run on NPU."""
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200793 # If the operation already have a read shape do not modify
794 # the ifm shape, since that will already be correct
795 if op.type == Op.FullyConnected and not op.read_shapes[0]:
Ayaan Masooda2ec5aa2022-04-21 14:28:03 +0100796 new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
797 assert new_shape is not None, "Tensor can not be reshaped to 2D"
798 op.ifm_shapes[0] = new_shape
Johan Alfvén65835e02022-10-13 10:49:30 +0200799
800 if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
801 # If IFM is batching then also make sure OFM is batching
802 h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
803 op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])
804
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200805 return op
806
807
Raul Farkas66207142023-05-25 11:15:20 +0100808def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation:
809 """Convert batched FullyConnected op shape to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200810 if op.type == Op.FullyConnected:
811 # Check if the first dimension indicates batching
812 if op.ifm_shapes[0].batch > 1:
813 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
814 n = op.ifm_shapes[0].batch
815 h, w = batching_split.get(n, (1, n))
816 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
817
818 # Reshape Weights to be 4D. IO becomes HWIO
819 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +0100820 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
821 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200822
823 n = op.ofm_shapes[0].batch
824 h, w = batching_split.get(n, (1, n))
825 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
826 return op
827
828
829def unfuse_activation_function(op):
830 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
831 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
832 op.activation = None
833 out_tens = op.outputs[0]
834 intermediate_tens = out_tens.clone("_act_intermediate")
835 act_op.set_output_tensor(out_tens)
836 act_op.add_input_tensor(intermediate_tens)
837 op.set_output_tensor(intermediate_tens)
838 act_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000839 DebugDatabase.add_optimised(op, act_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200840
841
842def rewrite_stridedslice_output(op, arch, nng):
843 if not op.run_on_npu or op.type != Op.StridedSlice:
844 return op
845
846 new_axis_mask = op.attrs["new_axis_mask"]
847 shrink_axis_mask = op.attrs["shrink_axis_mask"]
848
849 if shrink_axis_mask == 0 and new_axis_mask == 0:
850 return op
851
852 axis_4D = [0] * len(op.outputs)
853 for idx, out_tens in enumerate(op.outputs):
854 output_shape = list(out_tens.shape)
855
856 if shrink_axis_mask != 0:
857 n = 0
858 axis = 0
859 while shrink_axis_mask:
860 prev_mask = shrink_axis_mask
861 n += 1
862 shrink_axis_mask &= shrink_axis_mask - 1
863 axis = int(math.log2(prev_mask - shrink_axis_mask))
864 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
865
866 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
867 op.attrs["shrink_axis_mask"] = 0
868 if axis >= 0:
869 axis_4D[idx] = axis + (4 - len(output_shape))
870 else:
871 axis_4D[idx] = axis
872 op.ofm_shapes[idx] = Shape4D(output_shape)
873
874 elif new_axis_mask != 0:
875 n = 0
876 axis = 0
877 while new_axis_mask:
878 prev_mask = new_axis_mask
879 n += 1
880 new_axis_mask &= new_axis_mask - 1
881 axis = int(math.log2(prev_mask - new_axis_mask))
882 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
883 new_axis_mask >>= 1
884
885 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
886 op.attrs["new_axis_mask"] = 0
887 if axis >= 0:
888 axis_4D[idx] = axis + (4 - len(output_shape))
889 else:
890 axis_4D[idx] = axis
891 op.ofm_shapes[idx] = Shape4D(output_shape)
892
893 op.attrs["split_axis_4D"] = axis_4D
894 return op
895
896
897def rewrite_unpack_output(op, arch, nng):
898 tens = op.outputs[0]
899 if op.run_on_npu and op.type == Op.Unpack:
900 # Unpack is also referred to as Unstack
901 axis = int(op.attrs["axis"])
902 if axis < 0: # Convert to positive axis
903 axis = len(op.inputs[0].shape) + 1 + axis
904 op.type = Op.UnpackReshaped
905 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
906
907 axis_4D = axis + (4 - len(desired_output_shape))
908 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
909
910 for idx, out_tens in enumerate(op.outputs):
911 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
912 return op
913
914
915def add_padding_fields(op, arch, nng):
916 if op.run_on_npu:
917 if "padding" in op.attrs:
918 input_shape = op.ifm_shapes[0]
919 output_shape = op.ofm_shapes[0]
920 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
921 kernel_size = op.inputs[1].shape[:2]
922 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
923 kernel_size = op.attrs["ksize"][1:3]
924 else:
925 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
926
927 if op.type == Op.Conv2DBackpropInputSwitchedBias:
928 upscaling_factor = output_shape.height // input_shape.height
929 padding, skirt = calc_upscaled_padding_and_skirt(
930 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
931 )
932 else:
933 padding, skirt = calc_padding_and_skirt(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200934 op.attrs["padding"],
935 op.kernel,
936 input_shape,
937 op.attrs.get("explicit_padding"),
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200938 )
939
940 op.attrs["explicit_padding"] = padding
941 op.attrs["skirt"] = skirt
942
943 return op
944
945
Raul Farkas66207142023-05-25 11:15:20 +0100946def reorder_depthwise_weights(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200947 if op.type.is_depthwise_conv2d_op():
948 weight_tensor = op.inputs[1]
Alexander Hansson90c34b52023-05-31 15:03:03 +0000949 if not weight_tensor.weight_transpose_depthwise:
950 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
951 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
952 weight_tensor.weight_transpose_depthwise = True
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200953
954 return op
955
956
Raul Farkas3e7157b2023-05-09 09:09:17 +0100957def convert_avg_pool_to_conv2d(op: Operation, arch, nng) -> Operation:
958 """Convert strided Average Pools with stride >= 4 to Conv2D."""
959 if op.type != Op.AvgPool:
960 return op
961
962 stride_x, stride_y = op.get_kernel_stride()
963 # For strides <= 3 no optimization is needed
964 if stride_x <= 3:
965 return op
966 h, w = op.attrs["filter_height"], op.attrs["filter_width"]
967 inputs = op.inputs[0]
968 shape = inputs.shape
969
970 # Set necessary conv2d attributes
971 op.attrs.update(
972 {
973 "stride_h": stride_y,
974 "stride_w": stride_x,
975 "dilation_h_factor": 1,
976 "dilation_w_factor": 1,
977 "strides": (1, stride_y, stride_x, 1),
978 "dilation": (1, 1, 1, 1),
979 }
980 )
981
982 # Change op type
983 op.type = Op.Conv2DBias
984 op.name += "_conv2d"
985
986 op.rounding_mode = RoundingMode.AwayZero
987 shape = [h, w, 1, op.ofm.shape[-1]]
988 weights = np.full(shape, 1)
989 quant = QuantizationParameters(scale_f32=1 / (h * w), zero_point=0)
990 # Add unit weight tensor
991 op.add_input_tensor(
992 create_const_tensor(
993 "weights",
994 shape,
995 inputs.dtype,
996 weights,
997 quantization=quant,
998 ),
999 )
1000 op.weights.values = np.reshape(op.inputs[1].values, shape)
1001
1002 # Set IFM/OFM shapes after changing op type
1003 op.set_ifm_ofm_shapes()
1004 return op
1005
1006
1007def fixup_strided_conv(op: Operation, arch, nng):
Raul Farkas72c6a242023-03-16 16:38:05 +00001008 """Optimize or fixup strided Conv2DBias
1009 Optimization:
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001010 Reduce, when possible, the Conv2DBias stride from N with 1 > N > 4 to 1
1011 by re-shaping both IFM and filter.
Raul Farkas72c6a242023-03-16 16:38:05 +00001012
1013 Fixup:
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001014 Introduce software support for Conv2DBias with stride_width > 4 by
1015 reducing it to 1, 2 or 3 (HW supported strides) when possible by
1016 re-shaping both IFM and filter.
Raul Farkas72c6a242023-03-16 16:38:05 +00001017 """
Raul Farkas090f18a2023-01-24 16:29:06 +00001018 if op.type != Op.Conv2DBias:
Louis Verhaard43d27582022-03-17 14:06:00 +01001019 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001020 stride_x, stride_y = op.get_kernel_stride()
Louis Verhaard43d27582022-03-17 14:06:00 +01001021 weight_tensor = op.weights
1022 ifm_shape = op.ifm_shapes[0]
Raul Farkas69782af2023-05-09 10:39:52 +01001023
1024 # Do not optimize if op is not the first in the network and stride is
1025 # supported by the hardware
1026 if op.op_index != 0 and stride_x < 4:
1027 return op
1028
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001029 resize_factor, final_stride = calc_resize_factor(ifm_shape.width, stride_x)
1030
1031 def calc_filter_padding(
1032 ifm_padding_type: Padding | None,
1033 ifm_current_padding_x: int,
1034 post_op_stride: int,
1035 opt_resize_factor: int,
1036 filter_width: int,
Raul Farkas3b64f062023-05-16 17:18:31 +01001037 ifm_width: int,
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001038 ) -> tuple[int, int, int, int]:
1039 """Calculate zero padding to be added to the filter.
1040
1041 Parameters
1042 ----------
1043 ifm_padding_type : Padding or None
1044 The padding type that is applied to the IFM.
1045 ifm_current_padding_x : int
1046 Padding amount that is added to the IFM before optimization.
1047 post_op_stride : int
1048 The final stride once optimization is performed.
1049 opt_resize_factor : int
1050 The factor by which the stride will be reduced.
1051 E.g. opt_resize_factor = 2 on a stride of 4 will produce
1052 a stride of 2 after the optimization
1053 filter_width : int
1054 Width of the filter before optimization.
Raul Farkas3b64f062023-05-16 17:18:31 +01001055 ifm_width : int
1056 Width of the IFM before optimization
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001057
1058 Returns
1059 -------
1060 padding : tuple[int, int, int, int]
1061 A tuple with the ammount of padding on each side (top, left, bottom, right)
1062 """
1063 padding_size = 0
1064 padding = (0, 0, 0, 0)
1065 if ifm_padding_type and ifm_padding_type != Padding.VALID:
Raul Farkas3b64f062023-05-16 17:18:31 +01001066 # Compute padding size for the filter that guarantees that HW padding added to IFM matches
1067 # before and after the optimization is performed
1068 expected_filter_size = 0
1069 pre_opt_stride = post_op_stride * opt_resize_factor
1070 post_opt_ifm_width = ifm_width // opt_resize_factor
1071 # Compute the total expected filter size post optimization that ensures that the same HW padding
1072 # is added to IFM.
1073 # There are two ways of calculating required filter size depending on whether IFM width is divisible
1074 # by stride width or not. These approaches match the cases used to calculate HW padding in
1075 # needed_total_padding method.
1076 if ifm_width % pre_opt_stride == 0:
1077 expected_filter_size = ifm_current_padding_x + post_op_stride
1078 else:
1079 expected_filter_size = ifm_current_padding_x + (post_opt_ifm_width % post_op_stride)
1080 # Compute padding size from expected filter size
1081 padding_size = expected_filter_size * opt_resize_factor - filter_width
1082
1083 if ifm_current_padding_x == 0:
1084 # If no HW padding is added to IFM, divide filter padding between left and right following
1085 # the same strategy as the reference.
1086 padding_left = padding_size // 2
1087 else:
1088 # If HW padding is added to IFM, split padding for the filter so that left padding and right padding
1089 # are proportional to left and right HW padding.
1090 left_hw_padding = ifm_current_padding_x // 2
1091 # Compute filter padding
1092 padding_left = padding_size // ifm_current_padding_x * left_hw_padding
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001093 padding = (0, padding_left, 0, padding_size - padding_left)
1094
1095 # Check if filter width is divisible by the stride width (required for optimization)
Raul Farkas3b64f062023-05-16 17:18:31 +01001096 # If filter width is not divisible by stride width and no HW padding is added to IFM, compute
1097 # filter padding required for the filter width to be divisible by the stride width and apply it as right
1098 # padding.
1099 if filter_width % opt_resize_factor != 0 and (padding_size == 0 or ifm_current_padding_x == 0):
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001100 padding_size = opt_resize_factor - (filter_width % opt_resize_factor)
1101 # Add padding zeros to the right
1102 padding = (0, 0, 0, padding_size)
1103
1104 return padding
1105
1106 # Compute the depth of the IFM once the strided Conv2D is optimised
1107 post_opt_ifm_depth = ifm_shape.depth * resize_factor
1108
1109 if stride_x > 1 and (post_opt_ifm_depth <= 8 or stride_x > 3) and resize_factor != 1 and weight_tensor is not None:
1110 k_w, _ = op.get_kernel_size()
1111 weight_shape = weight_tensor.shape
1112
1113 padding_type = op.attrs.get("padding", None)
1114 if padding_type in (None, Padding.EXPLICIT, Padding.TILE):
Louis Verhaard43d27582022-03-17 14:06:00 +01001115 return op
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001116 # Compute current padding as if IFM padding is SAME
1117 curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
1118 # Compute the padding needed on the filter for the optimisation
1119 _, left_filter_padding, _, right_filter_padding = calc_filter_padding(
Raul Farkas3b64f062023-05-16 17:18:31 +01001120 padding_type, curr_padding_x, final_stride, resize_factor, k_w, ifm_shape.width
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001121 )
1122 total_horizontal_padding = left_filter_padding + right_filter_padding
1123 # If IFM padding is enabled, check if pre-opt and post-opt padding is
1124 # the same while taking into consideration the extra filter padding.
1125 if padding_type == Padding.SAME:
1126 optimised_padding_x = needed_total_padding(
1127 ifm_shape.width // resize_factor, final_stride, (k_w + 1 + total_horizontal_padding) // resize_factor
1128 )
1129 if curr_padding_x != optimised_padding_x:
1130 # Horizontal padding would become different after optimisation; this would not work
1131 return op
1132
1133 # Resize IFM
Raul Farkas090f18a2023-01-24 16:29:06 +00001134 op.ifm_shapes[0] = Shape4D(
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001135 [ifm_shape.batch, ifm_shape.height, ifm_shape.width // resize_factor, ifm_shape.depth * resize_factor]
Raul Farkas090f18a2023-01-24 16:29:06 +00001136 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001137
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001138 # Compute list of 0 padding for each dimensions of the filter
1139 filter_dimension_padding = [(0, 0) for _ in weight_tensor.shape]
1140 # Update padding for filter width with computed padding
1141 filter_dimension_padding[1] = (left_filter_padding, right_filter_padding)
1142 # Add padding to the filter
1143 zero_point = weight_tensor.quantization.zero_point
1144 padding_constant = zero_point if np.isscalar(zero_point) else 0
1145 padded_filter_tensor = np.pad(weight_tensor.values, filter_dimension_padding, constant_values=padding_constant)
1146 weight_shape[1] = padded_filter_tensor.shape[1]
1147 weight_tensor.values = padded_filter_tensor
Raul Farkas090f18a2023-01-24 16:29:06 +00001148 # Change weight shape based on stride_x
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001149 weight_shape[1] //= resize_factor
1150 weight_shape[2] *= resize_factor
Raul Farkas090f18a2023-01-24 16:29:06 +00001151
James Peet7519d502021-07-19 16:47:58 +01001152 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001153 weight_tensor.set_all_shapes(weight_shape)
1154 # If multiple copies of the weights are used, we could avoid
1155 # them having the same address by changing the value_id
1156 weight_tensor.value_id = uuid.uuid4()
1157
1158 # Strides
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001159 stride_x = final_stride
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001160 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
1161
1162 return op
1163
1164
Raul Farkas66207142023-05-25 11:15:20 +01001165def convert_conv_to_fc(op: Operation, arch, nng) -> Operation:
1166 """Convert 1x1 Conv2D that behave like FullyConnected to FullyConnected, since they don't need any weight
1167 buffering.
1168 """
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001169 # Conv 1x1 can be equivalent to Fully Connected.
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001170 # (Weights dont need to be reloaded for convs when IFM H and W are 1)
1171 if op.type == Op.Conv2DBias:
1172 h = op.ifm_shapes[0].height
1173 w = op.ifm_shapes[0].width
1174 kh, kw, _, _ = op.inputs[1].shape
1175 if h == 1 and w == 1 and kh == 1 and kw == 1:
1176 # Overwrite this op as a Fully Connected Op
1177 op.name += "_fc"
1178 op.type = Op.FullyConnected
1179 op.attrs = {
1180 "weights_format": 0,
1181 }
1182 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
1183 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +01001184 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
1185 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001186
1187 DebugDatabase.add_optimised(op, op)
1188 return op
1189
1190
Raul Farkas66207142023-05-25 11:15:20 +01001191def fixup_relus_with_differing_ifm_ofm_scaling(op: Operation, arch, nng) -> Operation:
1192 """Fixup Relu with different IFM and OFM to allow fusing by adding its own primary op."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001193 if op.run_on_npu and op.type.is_relu_op():
1194 ifm = op.inputs[0]
1195 ofm = op.outputs[0]
1196 # Relu with differing IFM and OFM scaling cannot be fused with another primary op
1197 # and requires its own to be inserted
1198 if not check_quantized_tens_scaling_equal(ifm, ofm):
1199 # Override this op with its own primary op (avgpool)
1200 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
1201 # And fuse the original activation function to it
1202 relu_fused_op.activation = create_activation_function(op.type)
Fredrik Svedberg1a7527c2021-09-13 15:52:16 +02001203 # Add explicit rescaling
1204 rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
1205 multiplier, shift = scaling.quantise_scale(rescale)
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001206 relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001207 # Tidy up and assign the ifm and ofm to the new op
1208 ifm.consumer_list.remove(op)
1209
1210 relu_fused_op.add_input_tensor(ifm)
1211 relu_fused_op.set_output_tensor(ofm)
1212 relu_fused_op.set_ifm_ofm_shapes()
1213 op = relu_fused_op
1214 return op
1215
1216
Raul Farkas66207142023-05-25 11:15:20 +01001217def convert_lstm(op: Operation, arch, nng) -> Operation:
1218 """Convert LSTM op into its basic opearations to allow for support on NPU."""
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02001219 if op.type == Op.UnidirectionalSequenceLstm:
1220 lstm = Lstm(op)
1221 op = lstm.get_graph()
1222 return op
1223
1224
Raul Farkas66207142023-05-25 11:15:20 +01001225def convert_softmax(op: Operation, arch, nng) -> Operation:
1226 """Convert Softmax op into its basic operations to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001227 if op.type == Op.Softmax and op.run_on_npu:
1228 softmax = SoftMax(op)
1229 op = softmax.get_graph()
1230 return op
1231
1232
Raul Farkas66207142023-05-25 11:15:20 +01001233def convert_prelu(op: Operation, arch, nng) -> Operation:
1234 """Convert PReLU op to other ops based on alpha values to allow for support on NPU."""
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001235 if op.type == Op.Prelu:
1236 ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
1237 if None in (ifm, alpha, ofm):
1238 return op
1239
Fredrik Svedberg66591652022-08-29 10:51:27 +02001240 if alpha.values is not None:
1241 # If const alpha check for possible optimisations
1242 alpha_zp = alpha.quantization.zero_point
1243 alpha_scale = alpha.quantization.scale_f32
1244 # If all alpha values are the same the PReLU can be converted to LeakyRelu
Rickard Bolin5fdcf172022-12-19 12:56:17 +00001245 alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
1246 alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
Fredrik Svedberg66591652022-08-29 10:51:27 +02001247 if alpha_min == alpha_max:
1248 # or even a Relu
1249 if alpha_min == 0:
1250 new_op = Op.Relu
1251 else:
1252 new_op = Op.LeakyRelu
1253 op.attrs["alpha"] = alpha_min
1254 # setup alpha_scaling for bit exact result
1255 ifm_scale = ifm.quantization.scale_f32
1256 ofm_scale = ofm.quantization.scale_f32
1257 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
1258 op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
1259 # Change op type
1260 op.type = new_op
1261 op.name = op.name.replace("Prelu", new_op.name)
1262 del op.inputs[1] # Remove alpha tensor
1263 return op
1264 elif alpha_max < 1:
1265 # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
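# (this works because for x >= 0, max(alpha * x, x) == x when alpha < 1, and for x < 0 it
# selects alpha * x, which matches the PReLU definition)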
1266 # Multiply with alpha tensor
1267 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1268 mul_alpha.add_input_tensor(ifm)
1269 mul_alpha.add_input_tensor(alpha)
1270 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1271 mul_alpha.set_output_tensor(fm_alpha)
1272 mul_alpha.set_ifm_ofm_shapes()
1273 DebugDatabase.add_optimised(op, mul_alpha)
1274 if check_quantized_tens_scaling_equal(ifm, ofm):
1275 # No scaling is needed
1276 fm_id = ifm
1277 else:
1278 # Add multiplication with identity
1279 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1280 mul_identity.add_input_tensor(ifm)
1281 # Create const tensor containing identity as scalar
1282 quantization = ifm.quantization.clone()
1283 quantization.scale_f32 = np.float32(1)
1284 quantization.zero_point = 0
1285 one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
1286 mul_identity.add_input_tensor(one)
1287 # Make sure that fm_id is allocated to a different address than fm_alpha
1288 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1289 mul_identity.set_output_tensor(fm_id)
1290 mul_identity.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001291 DebugDatabase.add_optimised(op, mul_identity)
Fredrik Svedberg66591652022-08-29 10:51:27 +02001292
1293 # Combine scaled and alpha multiplied values
1294 max_op = Operation(Op.Maximum, op.name + "_max")
1295 max_op.add_input_tensor(fm_alpha)
1296 max_op.add_input_tensor(fm_id)
1297 max_op.set_output_tensor(ofm)
1298 max_op.set_ifm_ofm_shapes()
1299
1300 DebugDatabase.add_optimised(op, max_op)
1301 ifm.consumer_list.remove(op)
1302 return max_op
1303
1304 # Catch-all PReLU conversion for the cases that could not be optimised above
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001305 no_scale_quant = ifm.quantization.clone()
1306 no_scale_quant.scale_f32 = None
1307 no_scale_quant.zero_point = 0
Fredrik Svedberg66591652022-08-29 10:51:27 +02001308 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001309
1310 # Select values < 0
1311 min_op = Operation(Op.Minimum, op.name + "_min")
1312 min_op.add_input_tensor(ifm)
1313 min_op.add_input_tensor(zero)
1314 fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
1315 min_op.set_output_tensor(fm_negative)
1316 min_op.set_ifm_ofm_shapes()
1317 DebugDatabase.add_optimised(op, min_op)
1318
1319 # and multiply with alpha tensor
1320 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1321 mul_alpha.add_input_tensor(fm_negative)
1322 mul_alpha.add_input_tensor(alpha)
1323 fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
1324 mul_alpha.set_output_tensor(fm_alpha)
1325 mul_alpha.set_ifm_ofm_shapes()
1326 DebugDatabase.add_optimised(op, mul_alpha)
1327
1328 # Select (and scale) values > 0
1329 relu_op = Operation(Op.Relu, op.name + "_relu")
1330 relu_op.add_input_tensor(ifm)
1331 fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1332 relu_op.set_output_tensor(fm_scaled)
1333 relu_op.set_ifm_ofm_shapes()
1334 DebugDatabase.add_optimised(op, relu_op)
1335
1336 # Add scaled and alpha multiplied values (without scaling)
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001337 add_op = Operation(Op.Add, op.name + "_add")
1338 add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001339 add_op.add_input_tensor(fm_alpha)
1340 add_op.add_input_tensor(fm_scaled)
1341 add_op.set_output_tensor(ofm)
1342 add_op.set_ifm_ofm_shapes()
1343
1344 DebugDatabase.add_optimised(op, add_op)
1345 ifm.consumer_list.remove(op)
1346 op = add_op
1347
1348 return op
1349
1350
Raul Farkas66207142023-05-25 11:15:20 +01001351def convert_mul_max_to_abs_or_lrelu(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001352 r"""Whenever there is a subgraph with this topology:
1353
Jonas Ohlssond8575072022-03-30 10:30:25 +02001354 Input X For X = -1 or X > 0
1355 | \ / This subgraph can be replaced with either
1356 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
1357 | /
1358 Max
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001359 """
1360
1361 if op.type == Op.Maximum:
1362 # finds the Mul input(s) to the Max
1363 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
1364 if len(muls) == 1:
1365 mul = muls[0].ops[0]
1366 elif len(muls) == 2:
1367 # In the case both inputs are Muls, find the one with the same input as the Max
Fredrik Svedberg66591652022-08-29 10:51:27 +02001368 mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
1369 if len(mul_ifms):
1370 mul = mul_ifms[0].ops[0]
1371 else:
1372 # Not using same input
1373 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001374 else:
1375 # No Mul inputs
1376 return op
1377
1378 # make sure the Mul doesn't have any other consumers
1379 mul_ofm = mul.outputs[0]
1380 if len(mul_ofm.consumers()) != 1:
1381 return op
1382 # make sure the Mul doesn't have a fused activation function
1383 if mul.activation:
1384 return op
1385 ifm, ofm = op.get_ifm_ofm()
1386 if ifm is None or ofm is None:
1387 return op
1388
1389 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1390 return op
1391 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
1392 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
1393 return op
1394
1395 # finds the branched input that goes to both the Max and the Mul
1396 shared = set(op.inputs) & set(mul.inputs)
1397 if len(shared) == 1:
1398 shared_in = shared.pop()
1399 # find the constant scalar input to the Mul
1400 const_tens = (set(mul.inputs) - {shared_in}).pop()
1401 # check that it is a scalar
1402 if const_tens.shape != []:
1403 return op
1404 const = const_tens.ops[0]
1405 # check that it is a constant
1406 if const.type != Op.Const:
1407 return op
1408 # Remove the Mul from the shared input's consumers
1409 shared_in.consumer_list.remove(mul)
1410 else:
1411 return op
1412
1413 val = const.outputs[0].values
1414 if val >= 0:
1415 new_op = Op.LeakyRelu
1416 op.attrs["alpha"] = val
1417 # to produce bit exact results, the alpha is not enough;
1418 # save additional scaling info in attr "alpha_scaling", to be used as input
1419 # to the LUT construction
James Peet7519d502021-07-19 16:47:58 +01001420 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001421 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
1422 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
1423 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
1424 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
1425 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
1426 elif val == -1:
1427 new_op = Op.Abs
1428 else:
1429 return op
1430
1431 op.type = new_op
1432 op.name = op.name.replace("Maximum", new_op.name)
1433 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
1434 op.inputs = [shared_in]
1435 op.set_ifm_ofm_shapes()
1436
1437 # Record optimisation in debug database
1438 DebugDatabase.add_optimised(op, op)
1439
1440 return op
1441
1442
Raul Farkas66207142023-05-25 11:15:20 +01001443def convert_hardswish_to_lut(op: Operation, arch, nng) -> Operation:
1444 """Convert HardSwish to LUT to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001445 if op.type == Op.HardSwish:
1446 ifm, ofm = op.get_ifm_ofm()
1447 # Generate the LUT
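# Reference definition: hardswish(x) = x * relu6(x + 3) / 6. The loop below evaluates this in
# 16-bit fixed point for every possible quantised input value and stores the results in a LUT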
1448 ifm_scale = np.double(ifm.quantization.scale_f32)
1449 ofm_scale = np.double(ofm.quantization.scale_f32)
1450 zp_in = ifm.quantization.zero_point
1451 zp_out = ofm.quantization.zero_point
1452 ifm_scale_hires = (1 / 128) * ifm_scale
1453 relu_multiplier = np.double(3 / 32768)
1454 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1455 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1456 # Use 16bit scale
1457 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1458 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1459
1460 values = []
1461 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1462 quantized_min = min(ix)
1463 quantized_max = max(ix)
1464 for x in ix:
1465 input_value = x - zp_in
1466 input_value_hires = input_value * 128
1467 # Compute the input value on essentially the output scale, not shifted yet
1468 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1469 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
1470 relu_value = np.int16(input_value_hires)
1471 if relu_shift < 31:
1472 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1473
1474 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1475
1476 if relu_shift < 31:
1477 relu_value = fp_math.shift_left16(relu_value, 1)
1478
1479 if relu_shift > 31:
1480 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1481
1482 # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
1483 # Now convert that to a 16bit fixedpoint value in [0, 1]
1484 relu_value = (relu_value + (1 << 15)) >> 1
1485 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1486 shift = 31 - out_shift
1487 shift = -shift if shift < 0 else 0
1488 # Finally apply the output shift
1489 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1490 lut_result = min(quantized_max, max(quantized_min, lut_result))
1491 values.append(lut_result)
1492 return convert_to_lut(op, values, "hardswish")
1493 return op
1494
1495
1496def convert_lrelu_to_mul_max(op, arch):
1497 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1498 # (the opposite of convert_mul_max_to_abs_or_lrelu)
1499 ifm, ofm = op.get_ifm_ofm()
1500 if ifm is None or ofm is None:
1501 return op
1502
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001503 alpha = np.float32(op.attrs["alpha"])
1504 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001505 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001506 if use_mul_max:
1507 mul_ifm = ifm
1508 new_op = Op.Maximum
1509 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001510 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001511 no_scale_quant = ifm.quantization.clone()
1512 no_scale_quant.scale_f32 = None
1513 no_scale_quant.zero_point = 0
1514 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1515
1516 # Select values < 0
1517 min_op = Operation(Op.Minimum, op.name + "_min")
1518 min_op.add_input_tensor(ifm)
1519 min_op.add_input_tensor(zero)
1520 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001521 if alpha < 0 and not is_converted_prelu:
1522 # For negative alpha that is not from a converted PReLU we need to use
1523 # int32 Mul below to perform the (negative) alpha scaling
1524 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001525 min_op.set_output_tensor(mul_ifm)
1526 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001527 new_op = Op.Add
1528 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001529 DebugDatabase.add_optimised(op, min_op)
1530
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001531 # Add multiplication with alpha
1532 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001533 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001534 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001535 quantization = ifm.quantization.clone()
1536 quantization.min = 0
1537 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1538 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001539 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001540 if is_converted_prelu:
1541 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001542 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001543 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001544 elif alpha == 0 or np.isinf(1 / alpha):
1545 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001546 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001547 scalar = 0
1548 else:
1549 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001550 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001551 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001552 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1553 else:
1554 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001555 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001556 mul_alpha.add_input_tensor(alpha_tens)
1557 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1558 mul_alpha.set_output_tensor(fm_alpha)
1559 mul_alpha.set_ifm_ofm_shapes()
1560 DebugDatabase.add_optimised(op, mul_alpha)
1561
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001562 if not use_mul_max:
1563 relu_op = Operation(Op.Relu, op.name + "_relu")
1564 relu_op.add_input_tensor(ifm)
1565 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1566 relu_op.set_output_tensor(fm_id)
1567 relu_op.set_ifm_ofm_shapes()
1568 DebugDatabase.add_optimised(op, relu_op)
1569 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001570 # No identity multiplication is needed
1571 fm_id = ifm
1572 else:
1573 # Add multiplication with identity
1574 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1575 mul_identity.add_input_tensor(ifm)
1576 # Create const tensor containing identity as scalar
1577 quantization = ifm.quantization.clone()
1578 quantization.min = 0
1579 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001580 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001581 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001582 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001583 mul_identity.add_input_tensor(identity_tens)
1584 # Make sure that fm_id is allocated to a different address than fm_alpha
1585 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1586 mul_identity.set_output_tensor(fm_id)
1587 mul_identity.set_ifm_ofm_shapes()
1588 DebugDatabase.add_optimised(op, mul_identity)
1589
1590 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001591 op.type = new_op
1592 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001593 op.inputs = []
1594 ifm.consumer_list.remove(op)
1595 op.add_input_tensor(fm_alpha)
1596 op.add_input_tensor(fm_id)
1597 op.set_ifm_ofm_shapes()
1598
1599 DebugDatabase.add_optimised(op, op)
1600 return op
1601
1602
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001603def convert_to_lut8(op, fn, fn_name):
1604 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1605 # fn is a function(real) -> real
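# e.g. (illustrative values) for an int8 Tanh with ifm_scale 0.1, zp_in 0, ofm_scale 1/128 and
# zp_out 0, the LUT entry for x = 10 is round_away_zero(math.tanh(1.0) * 128) = 97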
1606 ifm, ofm = op.get_ifm_ofm()
1607 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1608 return op
1609 # Generate the LUT
1610 ifm_scale = np.double(ifm.quantization.scale_f32)
1611 ofm_scale = np.double(ofm.quantization.scale_f32)
1612 zp_in = ifm.quantization.zero_point
1613 zp_out = ofm.quantization.zero_point
1614 values = []
1615 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1616 quantized_min = min(ix)
1617 quantized_max = max(ix)
1618 for x in ix:
1619 x_real = ifm_scale * (x - zp_in)
1620 y_real = fn(x_real)
1621 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1622 lut_result = min(quantized_max, max(quantized_min, lut_result))
1623 values.append(lut_result)
1624 return convert_to_lut(op, values, fn_name)
1625
1626
1627def convert_lrelu_to_lut(op, arch):
1628 ifm, ofm = op.get_ifm_ofm()
1629 # Generate the LUT
1630 alpha = op.attrs["alpha"]
1631 ifm_scale = np.double(ifm.quantization.scale_f32)
1632 ofm_scale = np.double(ofm.quantization.scale_f32)
1633 zp_in = ifm.quantization.zero_point
1634 zp_out = ofm.quantization.zero_point
1635 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1636 alpha_scalar = 1
1637 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1638 if "alpha_scaling" in op.attrs:
1639 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
1640 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1641 values = []
1642 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1643 quantized_min = min(ix)
1644 quantized_max = max(ix)
1645 for x in ix:
1646 if x < zp_in:
1647 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1648 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1649 )
1650 else:
1651 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1652 lut_result = min(quantized_max, max(quantized_min, lut_result))
1653 values.append(lut_result)
1654 return convert_to_lut(op, values, "lrelu")
1655
1656
Raul Farkas66207142023-05-25 11:15:20 +01001657def convert_lrelu(op: Operation, arch, nng) -> Operation:
1658 """Convert LeakyRelu to a LUT based solution if possible, otherwise a mul + max."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001659 if op.type != Op.LeakyRelu:
1660 return op
1661 ifm, ofm = op.get_ifm_ofm()
1662 if ifm is None or ofm is None:
1663 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001664 alpha = op.attrs["alpha"]
1665 if alpha == 0:
1666 # When alpha is 0 the operation can be converted to a ReLU
1667 op.type = Op.Relu
1668 op.name = op.name.replace("LeakyRelu", op.type.name)
1669 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001670 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1671 # use LUT for int8/uint8
1672 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001673 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001674 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001675 return op
1676 return convert_lrelu_to_mul_max(op, arch)
1677
1678
Raul Farkas66207142023-05-25 11:15:20 +01001679def convert_tanh_sigmoid_to_lut(op: Operation, arch, nng) -> Operation:
1680 """Convert int8/uint8 Sigmoid and Tanh to a LUT based solution."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001681 if op.type == Op.Sigmoid:
1682 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1683 elif op.type == Op.Tanh:
1684 return convert_to_lut8(op, math.tanh, "tanh")
1685 return op
1686
1687
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001688def fuse_activation_function_with_prev(op, arch, nng):
1689 # if op is a no-op: attempts to move the activation function to the preceding op
1690 if not op.attrs.get("is_nop", False) or op.activation is None:
1691 return op
1692 ifm, ofm = op.get_ifm_ofm()
1693 if ifm is None or ofm is None:
1694 return op
1695 # finds the input(s) to the operation
1696 prev_op = ifm.ops[0]
1697 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1698 fuse = (
1699 prev_op.run_on_npu
1700 and prev_op.type.npu_block_type != NpuBlockType.Default
1701 and len(ifm.ops) == 1
1702 and len(prev_op.outputs[0].consumers()) == 1
1703 and prev_op.activation is None
1704 )
1705 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1706 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1707 # LUT currently only works correctly for elementwise ops
1708 fuse = False
1709 if not fuse:
1710 return op
1711 # Move the fused activation function + corresponding info to prev_op
1712 prev_op.activation = op.activation
1713 prev_op.forced_output_quantization = op.forced_output_quantization
1714 if op.activation_lut is not None:
1715 prev_op.set_activation_lut(op.activation_lut)
1716 # Bypass op
1717 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001718 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001719 return op
1720
1721
1722def _leading_pad_ok(leading_pad, stride, kernel_size):
1723 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1724 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
1725 max_size = kernel_size // 2
1726 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
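# e.g. a 7x7 kernel (max_size 3) with stride 2: a leading pad of 2 or 3 can be replaced by
# hardware padding, but a leading pad of 1 cannot (1 != 3, 3 > 2 and 1 % 2 != 0)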
1727
1728
Raul Farkas66207142023-05-25 11:15:20 +01001729def replace_pad_by_hw_pad(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001730 """
1731 Tries to completely remove a PAD operator by using hardware padding.
1732 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1733 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1734 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1735 if both operations can be run on the NPU.
1736 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1737 """
1738 if (
1739 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001740 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001741 and op.run_on_npu
1742 and op.attrs["padding"] == Padding.VALID
1743 ):
1744 pad_op = op.ifm.ops[0]
1745 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1746 return op
1747 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1748 return op
1749 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1750 k = op.kernel
1751 k_w, k_h = k.dilated_wh()
1752
1753 # Check if the PAD operator can be replaced by hardware padding
1754 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1755 # Too much padding, it would require hardware padding to actually insert zeros
1756 return op
1757 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1758 return op
1759
1760 if op.type.is_avgpool_op():
1761 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1762 for pad, k_size in (
1763 (left, k_w),
1764 (right, k_w),
1765 (top, k_h),
1766 (bottom, k_h),
1767 ):
1768 if pad not in (0, k_size // 2):
1769 return op
1770 # Average pool is converted to depthwise, because NPU average pool + same padding
1771 # has a special implementation that is different from PAD followed by average pool with
1772 # valid padding.
1773 k_w, k_h = op.kernel.width, op.kernel.height
1774 ifm = op.ifm
1775 # Remember other inputs
1776 other_inputs = op.inputs[1:]
1777 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1778 quantization = QuantizationParameters(0.0, 255.0)
1779 quantization.scale_f32 = 1.0 / (k_w * k_h)
1780 quantization.zero_point = 0
1781 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1782 weights = np.full(shape, 1)
1783
1784 weight_tens = create_const_tensor(
1785 op.name + "_weights",
1786 shape,
1787 op.ifm.dtype,
1788 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001789 purpose=TensorPurpose.Weights,
1790 quantization=quantization,
1791 )
James Peet7519d502021-07-19 16:47:58 +01001792 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001793 op.type = Op.DepthwiseConv2DBias
1794 op.inputs = []
1795 op.add_input_tensor(ifm)
1796 op.add_input_tensor(weight_tens)
Tim Hall5ff4cd12023-05-16 22:39:14 +01001797
1798 if op.ifm.dtype == DataType.uint8:
1799 op.rounding_mode = RoundingMode.HalfUp
1800
1801 # Add bias tensor, all biases set to 0
1802 op.inputs.append(None)
1803 fixup_bias_tensors(op, arch, nng, DataType.int32)
1804
1805 else:
1806 op.rounding_mode = RoundingMode.AwayZero
1807
1808 # The DepthwiseConv needs to be performed with the IFM zero point set appropriately so that the correct
1809 # pad values are used. However, in order to use the rounding away from zero mode the zero point needs to
1810 # have been removed so that the zero point is at zero. This is done by adding a kernel sized amount of
1811 # the zero point as a bias. The datatype of the bias needs to be set to int32, even for an int16 IFM,
1812 # because this will cause full precision scaling to be used (see weight compression). Finally, the OFM
1813 # zero point will need forcing to zero (as it has already been removed)
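# e.g. (illustrative values) an int8 IFM with zero point -5 and a 3x3 kernel gives a per-channel
# bias of -5 * 3 * 3 = -45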
1814 nr_biases = op.inputs[1].shape[-1]
1815 bias_values = [op.ifm.quantization.zero_point * k_h * k_w] * nr_biases
1816 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1817 op.add_input_tensor(bias_tensor)
1818
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001819 # Add other inputs
1820 op.inputs.extend(other_inputs)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001821
1822 # Bypass the PAD operator
1823 op.set_input_tensor(pad_op.ifm, 0)
1824 # Adjust the padding attributes of the convolution operator
1825 op.attrs["padding"] = Padding.EXPLICIT
1826 op.attrs["explicit_padding"] = (top, left, bottom, right)
1827 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001828 DebugDatabase.add_optimised(op, op)
1829
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001830 return op
1831
1832
1833def convert_pad(op: Operation, arch, nng):
1834 """
1835 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1836 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1837 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
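E.g. padding (top, left, bottom, right) = (1, 2, 0, 0) becomes one average pool that copies the IFM
into the OFM at offset (1, 2), plus one average pool that fills a one-row band of zero-point values
at the top and one that fills a two-column band at the left.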
1838 """
1839 if op.type != Op.Pad or not op.run_on_npu:
1840 return op
1841 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1842
1843 ifm = op.ifm
1844 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001845 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001846 ofm = op.ofm
1847 assert ofm is not None
1848 ofm.ops = []
1849 ofm_shape = op.ofm_shapes[0]
1850
1851 # Average pool op that copies IFM to the right place inside the OFM
1852 shp0 = Shape4D(0, 0, 0, 0)
1853 shp_top = shp0.with_height(top)
1854 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1855 avgpool_op.activation = op.activation
1856 quant = ofm.quantization
1857 pad_value = quant.zero_point
1858 # Add operations that fill the borders of the OFM
1859 if top > 0:
1860 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1861 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001862 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001863 )
1864 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1865 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1866 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1867 if bottom > 0:
1868 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1869 zero_tens = create_const_tensor(
1870 op.name + "_bottom",
1871 shape.as_list(),
1872 ofm.dtype,
1873 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001874 quantization=quant,
1875 )
1876 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1877 create_avg_pool_for_concat(
1878 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1879 )
1880 if left > 0:
1881 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1882 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001883 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001884 )
1885 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1886 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1887 if right > 0:
1888 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1889 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001890 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001891 )
1892 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1893 create_avg_pool_for_concat(
1894 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1895 )
1896
1897 op.type = Op.ConcatTFLite
1898 return avgpool_op
1899
1900
Raul Farkas66207142023-05-25 11:15:20 +01001901def fixup_bias_tensors(op: Operation, arch, nng, dtype=None) -> Operation:
1902 """Fixup ops that require a bias and don't have one by adding a bias tensor filled with zeros."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001903 if op.type.needs_bias() and op.bias is None:
1904 # Op has no bias, add bias tensor filled with zeros
1905 nr_biases = op.inputs[1].shape[-1]
1906 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001907 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1908 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1909 # For int16 the selected bias DataType will have an impact on the scaling
1910 # used when encoding the scales and biases later. The default mode will match the
1911 # reference with reduced scaling for int64 bias.
1912 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1913 # is used to emulate average pool int32 bias should be selected for full precision
1914 # int16 scaling.
1915 if dtype is None:
1916 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1917 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Raul Farkas3e7157b2023-05-09 09:09:17 +01001918 bias_index = op.type.info.indices.biases[0]
1919 if bias_index < len(op.inputs):
1920 op.set_input_tensor(bias_tensor, bias_index)
1921 else:
1922 op.add_input_tensor(bias_tensor)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001923
1924 return op
1925
1926
wilisa0146c94772023-02-08 09:56:14 +00001927def detect_asymmetric_weights(op):
1928 # Check all ops (cpu and npu)
1929 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1930 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001931 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001932 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1933 return True
1934 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001935
wilisa0146c94772023-02-08 09:56:14 +00001936
Raul Farkas66207142023-05-25 11:15:20 +01001937def fixup_asymmetric_weights(op: Operation, arch, nng) -> Operation:
wilisa0146c94772023-02-08 09:56:14 +00001938 if detect_asymmetric_weights(op):
1939 if op.run_on_npu:
1940 print("Zero points have been adjusted.")
1941 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001942 return op
1943
1944
wilisa0146c94772023-02-08 09:56:14 +00001945def check_asymmetric_weights(op, arch, nng):
1946 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1947 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1948 # possibility of other graph optimiser functions modifying the operator (that is later run on the CPU)
1949 if detect_asymmetric_weights(op):
1950 if op.run_on_npu:
1951 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1952 op.run_on_npu = False
1953 return op
1954
1955
1956def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
1957 if force_symmetric_int_weights:
1958 return fixup_asymmetric_weights
1959 else:
1960 return check_asymmetric_weights
1961
1962
Rickard Bolina68b82a2023-04-20 15:12:28 +00001963def convert_mean_to_depthwise_conv(op, arch, nng):
Alexander Hansson90c34b52023-05-31 15:03:03 +00001964 """
1965 When h x w <= 4096 a single DepthwiseConv2DBias is enough (left). When h x w > 4096 there is a need to
1966 split into several ops (right). Do this by splitting up h and changing the read_offset/shape.
1967 Below is an example where ifm is 1x190x64x1
1968 MEAN MEAN
1969 | |-----------------------|----------------------|
1970 DepthwiseConv2DBias 1_DepthwiseConv2DBias 2_DepthwiseConv2DBias 3_DepthwiseConv2DBias
1971 | | | |
1972 MUL |---------ADD-----------| |
1973 | |
1974 |----------------ADD---------------|
1975 |
1976 MUL
1977 1_DepthwiseConv2DBias: read_offset [0, 0, 0, 0]> read_shape [1, 64, 64, 1]>
1978 2_DepthwiseConv2DBias: read_offset [0, 64, 0, 0]> read_shape [1, 64, 64, 1]>
1979 3_DepthwiseConv2DBias: read_offset [0, 128, 0, 0]> read_shape [1, 62, 64, 1]>
1980 """
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001981 if op.type == Op.Mean and op.run_on_npu:
Alexander Hansson90c34b52023-05-31 15:03:03 +00001982 max_kernel_size = 4096
1983 max_height = 64
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001984 inp, axis = op.inputs
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001985 dims = len(inp.shape)
1986 dims_ofm = len(op.ofm.shape)
Alexander Hansson90c34b52023-05-31 15:03:03 +00001987 ofmq = op.ofm.quantization
1988 ifmq = op.ifm.quantization
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001989
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001990 # reduce_axis[i] is true if axis i should be reduced
1991 if axis.shape == []:
1992 reduce_axis = [True if i == axis.values else False for i in range(dims)]
1993 else:
1994 reduce_axis = [True if i in axis.values else False for i in range(dims)]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001995
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001996 ifm_shape = inp.shape.copy()
1997 intermediate_shape = op.ofm.shape.copy()
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001998
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001999 # Fix intermediate_shape when keep_dims is false
2000 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the intermediate_shape should be 1xHx1xC
2001 if dims_ofm < dims:
2002 for i in range(dims):
2003 if reduce_axis[i]:
2004 intermediate_shape.insert(i, 1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002005
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002006 # Reshape to 4D
Alexander Hanssonda8741a2023-06-30 15:41:13 +00002007 reduce_axis = full_shape(4, reduce_axis, False)
2008 ifm_shape = full_shape(4, ifm_shape, 1)
2009 intermediate_shape = full_shape(4, intermediate_shape, 1)
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002010
2011 # If all dimensions to reduce have shape 1, the operation is essentially a memcpy.
2012 # We can then remove the whole op by propagating ofm to previous ops
2013 if not any([reduce_axis[i] and ifm_shape[i] > 1 for i in range(4)]):
2014 op.type = Op.Memcpy
2015 op = bypass_memory_only_ops(op, arch, nng)
2016 return op
2017
Alexander Hanssonda8741a2023-06-30 15:41:13 +00002018 # Support mean over depth-axis by left-shifting the C channel
2019 # From semantics checks we can assume that one of H,W,C has shape 1
2020 if reduce_axis[3] and ifm_shape[3] > 1:
2021 assert 1 in ifm_shape[1:], "Mean reduction over depth channel, but none of H,W,C has shape 1"
2022 # If W=1 reshape NxHx1xC -> NxHxCx1, else reshape Nx1xWxC -> NxWxCx1
2023 idx_to_del = 2 if ifm_shape[2] == 1 else 1
2024
2025 # Delete axis with size 1
2026 del reduce_axis[idx_to_del]
2027 del ifm_shape[idx_to_del]
2028 del intermediate_shape[idx_to_del]
2029
2030 # Add another element to set channel-axis to one
2031 reduce_axis.append(False)
2032 ifm_shape.append(1)
2033 intermediate_shape.append(1)
2034
2035 # Compute kernel sizes for our convolutions
2036 # Batch axis is implicit as it is only supported if batch size is 1.
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002037 h = ifm_shape[1] if reduce_axis[1] else 1
2038 w = ifm_shape[2] if reduce_axis[2] else 1
2039
Alexander Hansson90c34b52023-05-31 15:03:03 +00002040 num_elements_in_axis = h * w
2041
2042 # If one convolution is enough, but height is greater than max kernel height
2043 # reshape from HxW to 1x(HxW)
2044 # This can only be done if the mean is computed over both H and W
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002045 if h > max_height and num_elements_in_axis <= max_kernel_size and reduce_axis[1] and reduce_axis[2]:
2046 ifm_shape = [ifm_shape[0], 1, h * w, ifm_shape[3]]
Alexander Hansson90c34b52023-05-31 15:03:03 +00002047 w = h * w
2048 h = 1
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002049
Alexander Hansson90c34b52023-05-31 15:03:03 +00002050 intermediate_op = None
2051 height_per_conv = min(max_kernel_size // w, h)
2052 height_per_conv = min(height_per_conv, max_height)
2053 num_convs = math.ceil(h / height_per_conv)
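# Using the 1x190x64x1 example from the docstring: height_per_conv = min(4096 // 64, 190, 64) = 64
# and num_convs = ceil(190 / 64) = 3, i.e. three DepthwiseConv2DBias ops reading 64, 64 and 62 rows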
2054 convs = list()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002055
Alexander Hansson90c34b52023-05-31 15:03:03 +00002056 for i in range(num_convs):
2057 is_last_op = i == (num_convs - 1)
2058
2059 intermediate_op = op.clone(f"{op.name}_conv_{i}")
2060
2061 intermediate_op.type = Op.DepthwiseConv2DBias
2062
2063 # Set necessary depthwise attributes
2064 intermediate_op.attrs.update(
2065 {
2066 "padding": Padding.VALID,
2067 "stride_h": 1,
2068 "stride_w": 1,
2069 "strides": (1, 1, 1, 1),
2070 "depth_multiplier": 1,
2071 "channel_multiplier": 1,
2072 "dilation_h_factor": 1,
2073 "dilation_w_factor": 1,
2074 "dilation": (1, 1, 1, 1),
2075 }
2076 )
2077
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002078 b, _, _, c = ifm_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002079
2080 intermediate_tensor = op.ofm.clone(suffix=f"_conv_sum_{i}", set_unique=True)
2081 intermediate_tensor.dtype = DataType.int32
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002082 intermediate_tensor.shape = intermediate_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002083 intermediate_op.set_output_tensor(intermediate_tensor)
2084
2085 # as we have several convs, scaling/rounding must be done after the sum has been calculated
2086 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2087
2088 # compute height for the kernel
2089 if is_last_op and h % height_per_conv != 0:
2090 weight_h = h % height_per_conv
2091 else:
2092 weight_h = height_per_conv
2093
2094 # compute ifm read offset and shape for the convolution
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002095 read_shape_h = weight_h if reduce_axis[1] else ifm_shape[1]
2096 read_shape_w = w if reduce_axis[2] else ifm_shape[2]
Alexander Hansson90c34b52023-05-31 15:03:03 +00002097
2098 intermediate_op.read_offsets[0] = Shape4D([0, i * height_per_conv, 0, 0])
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002099 intermediate_op.read_shapes[0] = Shape4D(ifm_shape).with_hw(read_shape_h, read_shape_w)
Alexander Hansson90c34b52023-05-31 15:03:03 +00002100
2101 weight_quant = QuantizationParameters(0, 255, scale_f32=1.0, zero_point=0)
2102 weight_shape = [weight_h, w, c, b]
2103 weight_tensor = create_const_tensor(
2104 f"{intermediate_op.name}_weights",
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002105 weight_shape,
Alexander Hansson90c34b52023-05-31 15:03:03 +00002106 DataType.uint8,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002107 np.ones(weight_shape),
Alexander Hansson90c34b52023-05-31 15:03:03 +00002108 TensorPurpose.Weights,
2109 quantization=weight_quant,
2110 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002111
Alexander Hansson90c34b52023-05-31 15:03:03 +00002112 weights_1D = np.ones(np.prod(weight_shape))
2113 weight_tensor.equivalence_id = create_equivalence_id(tuple(weights_1D))
2114 weight_tensor.value_id = weight_tensor.equivalence_id
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002115
Alexander Hansson90c34b52023-05-31 15:03:03 +00002116 intermediate_op.set_input_tensor(weight_tensor, 1)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002117
Alexander Hansson90c34b52023-05-31 15:03:03 +00002118 dtype = DataType.int64 if intermediate_op.ifm.dtype == DataType.int16 else DataType.int32
2119 bias_values = [0] * c
2120 bias = create_const_tensor(f"{intermediate_op.name}_bias", [c], dtype, bias_values)
2121 bias.equivalence_id = create_equivalence_id(tuple(bias_values))
2122 bias.value_id = bias.equivalence_id
2123 intermediate_op.inputs.append(bias)
2124 intermediate_op.set_ifm_ofm_shapes()
Johan Alfven7b3008a2023-04-13 18:54:47 +02002125
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002126 # We want to avoid reshaping the ifm tensor directly, to not affect other ops
Alexander Hansson90c34b52023-05-31 15:03:03 +00002127 # so we update the shape explicitly for this operation
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002128 intermediate_op.ifm_shapes[0] = Shape4D(ifm_shape)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002129
Alexander Hansson90c34b52023-05-31 15:03:03 +00002130 convs.append(intermediate_op)
2131 DebugDatabase.add_optimised(op, intermediate_op)
2132
2133 # If we have more than one convolution
2134 # We use add operations to accumulate the intermediate tensors
2135 if len(convs) > 1:
2136 prev_add_op = None
2137 idx = 0
2138
2139 while len(convs):
2140 intermediate_tensor = op.ofm.clone(suffix=f"_add_sum_{idx}", set_unique=True)
2141 intermediate_tensor.dtype = DataType.int32
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002142 intermediate_tensor.shape = intermediate_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002143
2144 one_scale_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
2145
2146 ifm = convs.pop().ofm
2147 if not prev_add_op:
2148 ifm2 = convs.pop().ofm
2149 else:
2150 ifm2 = prev_add_op.ofm
Alexander Hansson90c34b52023-05-31 15:03:03 +00002151 intermediate_op = create_add(f"{op.name}_add_{idx}", ifm, ifm2, one_scale_quant)
2152 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2153 intermediate_op.set_output_tensor(intermediate_tensor)
2154 intermediate_op.set_ifm_ofm_shapes()
2155
2156 prev_add_op = intermediate_op
2157 idx += 1
2158
2159 DebugDatabase.add_optimised(op, intermediate_op)
2160
2161 # Convert the original mean op to our final Mul operation
2162 # Which scales and divides by num_elements_in_axis
2163 op.type = Op.Mul
2164 op.name = f"{op.name}_mul"
2165 op.attrs = {}
2166 op.set_input_tensor(intermediate_op.ofm, 0)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002167
Johan Alfven7b3008a2023-04-13 18:54:47 +02002168 # The multiplier is calculated in the same way as in the reference,
2169 # clamping the shift value at the price of some precision loss.
Johan Alfven7b3008a2023-04-13 18:54:47 +02002170 output_multiplier, output_shift_vela = quantise_scale(np.double(ifmq.scale_f32) / np.double(ofmq.scale_f32))
2171
2172 # Convert to reference representation shift value
2173 output_shift = 31 - output_shift_vela
2174
2175 # Reference calculation
2176 # round_down_log2 same as 63 - CountLeadingZeros(num_elements_in_axis)
2177 shift = round_down_log2(num_elements_in_axis)
2178 shift = min(shift, 32)
2179 shift = min(shift, 31 + output_shift)
2180 output_multiplier = (output_multiplier << shift) // num_elements_in_axis
2181 output_shift = output_shift - shift
2182
2183 # Convert to vela representation shift
2184 output_shift_vela = 31 - output_shift
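# e.g. (illustrative) averaging over 190 * 64 = 12160 elements gives shift = round_down_log2(12160) = 13,
# so the multiplier is scaled up by 2**13 before the integer division by 12160 and the reference shift is
# reduced by 13, keeping the combined scale (multiplier / num_elements) in 32-bit fixed point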
2185
2186 # For int32 scaling is not supported so instead multiply with the scale
2187 # intermediate * scale -> round and shift.
Alexander Hansson90c34b52023-05-31 15:03:03 +00002188 identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
Johan Alfven7b3008a2023-04-13 18:54:47 +02002189 scalar = create_const_tensor(
2190 op.name + "_scalar", [1, 1, 1, 1], DataType.int32, [output_multiplier], quantization=identity_quant
2191 )
Alexander Hansson90c34b52023-05-31 15:03:03 +00002192 op.set_input_tensor(scalar, 1)
2193 op.set_ifm_ofm_shapes()
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002194 op.ofm_shapes[0] = Shape4D(intermediate_shape)
Johan Alfven7b3008a2023-04-13 18:54:47 +02002195
2196 # Reference using TFL rounding for the multiply
Alexander Hansson90c34b52023-05-31 15:03:03 +00002197 op.rounding_mode = RoundingMode.TFLite
Johan Alfven7b3008a2023-04-13 18:54:47 +02002198
2199 # Need to use explicit scaling to get the wanted shift
Alexander Hansson90c34b52023-05-31 15:03:03 +00002200 op.explicit_scaling = ExplicitScaling(False, [output_shift_vela], [1])
2201 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002202 return op
2203
2204
Raul Farkas66207142023-05-25 11:15:20 +01002205def convert_ops_to_lut(op: Operation, arch, nng) -> Operation:
2206 """Convert Exp to 8bit or 16bit LUT to allow for support on NPU."""
Johan Alfvence502732023-04-24 13:35:40 +02002207 if op.type == Op.Exp:
2208 if op.ifm.dtype == DataType.int8:
2209 return create_lut_8bit_op(op, math.exp, "exp")
2210 elif op.ifm.dtype == DataType.int16:
2211 return create_lut_int16_op(op, math.exp, "exp")
2212 else:
2213 # Should already be catched in tflite supported ops
2214 assert False, f"Unsupported data type {op.ifm.dtype} for {op.type}"
2215
Johan Alfven8e525ca2023-05-07 13:12:37 +02002216 if op.type == Op.Rsqrt:
2217 return create_lut_rsqrt_int8_op(op)
2218
Johan Alfvence502732023-04-24 13:35:40 +02002219 return op
2220
2221
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002222def optimise_quantize(op: Operation, arch, nng):
2223
2224 if op.type == Op.Quantize and op.run_on_npu:
2225
2226 ifm, ofm = op.get_ifm_ofm()
2227 input_values = ifm.values
2228
2229 # Guard clause - input not const or no values to quantize
2230 if ifm.ops[0].type != Op.Const or input_values is None:
2231 return op
2232
2233 # Singular val in numpy array, convert to indexable array
2234 if input_values.ndim == 0:
2235 input_values = np.array([input_values])
2236
Fredrik Svedberg11563172022-07-06 14:54:12 +02002237 # requantized int8 to int8 or int16 to int16
2238 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002239
2240 # scale needs to use double precision to match TFLite reference kernel
2241 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
2242 effective_multiplier, effective_shift = quantise_scale(effective_scale)
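# e.g. (illustrative values) ifm scale 0.5 and ofm scale 0.25 give an effective scale of 2.0, so an
# input value of 10 (with both zero points at 0) requantises to 20, clamped to the ofm quant range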
2243
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002244 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002245 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002246 input_val = val - ifm.quantization.zero_point
2247
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002248 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
2249 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002250
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002251 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
2252 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002253
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002254 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
2255 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002256
2257 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002258 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002259
2260 quantized_vals = []
2261 for val in input_values:
2262
2263 # Derive quantized value
2264 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002265 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
2266 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002267
2268 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002269 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
2270
2271 # Unsupported data type
2272 else:
2273 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002274
2275 # Make quantize op const and disconnect from parent node
2276
2277 # Remove reference of the current quant op from the parent tensor's consumer list
2278 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2279
2280 # Clear any references to parent node
2281 op.inputs = []
2282
2283 # Convert this quantize op to const
2284 op.type = Op.Const
2285
2286 return op
2287
2288
Ayaan Masood4965fae2022-06-29 11:30:57 +01002289def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
2290 """Static optimisation for SHAPE operator output value known at compile time"""
2291
2292 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
2293
2294 if op.type == Op.Shape and op.run_on_npu:
2295
2296 ifm, ofm = op.get_ifm_ofm()
2297
2298 if len(ifm.shape) != ofm.shape[0]:
2299 return op
2300
2301 # Remove reference of the current shape op from the parent tensor's consumer list
2302 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2303
2304 # Clear any references to parent node
2305 op.inputs = []
2306
2307 # Convert this SHAPE op to const
2308 op.type = Op.Const
2309
2310 # Add size calculation to shape output tensors
2311 ofm.values = np.array(ifm.shape)
2312
2313 return op
2314
2315
Raul Farkas66207142023-05-25 11:15:20 +01002316def fixup_dilation_gt2(op: Operation, arch, nng) -> Operation:
2317 """Fixup Conv2DBias and DepthwiseConv2DBias to allow dilation greater than 2."""
Tim Hallea4ba662022-11-11 18:19:53 +00002318 assert op.run_on_npu
2319 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
2320 dilation_w, dilation_h = op.get_kernel_dilation()
2321
2322 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
2323 # kernel
2324 if dilation_w > 2 or dilation_h > 2:
2325 kernel_w, kernel_h = op.get_kernel_size()
2326 kernel_ic = op.weights.shape[-2]
2327 kernel_oc = op.weights.shape[-1]
2328
2329 # if the dilation is a multiple of 2 then the hardware dialtion can be enabled to provide that multiple
2330 # of 2. this allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
2331 # odd = 1, even = 2
2332 hw_dilation_h = 1 if (dilation_h & 1) else 2
2333 hw_dilation_w = 1 if (dilation_w & 1) else 2
2334
2335 scale_dilation_h = dilation_h // hw_dilation_h
2336 scale_dilation_w = dilation_w // hw_dilation_w
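# e.g. a 3x3 kernel with dilation 4 gives hw_dilation 2 and scale_dilation 2, so the kernel is
# expanded below to (3 - 1) * 2 + 1 = 5 -> a sparse 5x5 kernel which, combined with hardware
# dilation 2, reproduces the original effective dilation of 4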
2337
2338 # create new empty kernel (HWIO format)
2339 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
2340 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
2341
2342 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
2343 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
2344
2345 # copy the original kernel values into the new sparse kernel
2346 for h in range(0, kernel_h):
2347 for w in range(0, kernel_w):
2348 new_h = h * scale_dilation_h
2349 new_w = w * scale_dilation_w
2350 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
2351
2352 # update the weight tensor with the new dilated kernel
2353 op.weights.shape = new_kernel_shape
2354 op.weights.values = new_kernel_values
2355
2356 # enable(=2) / disable(=1) hardware dilation
2357 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
2358 op.attrs["dilation_h_factor"] = hw_dilation_h
2359 op.attrs["dilation_w_factor"] = hw_dilation_w
2360
2361 return op
2362
2363
Tim Hall2180a172023-03-10 18:11:34 +00002364def fixup_reshape(op, arch, nng):
2365 def _get_explicit_shape(implicit_shape, total_size):
2366 # the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to
2367 # the appropriate value
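# e.g. an implicit shape of [4, -1] with total_size 12 gives the explicit shape [4, 3]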
2368 if implicit_shape is None:
2369 return None
2370
2371 explicit_shape = list(implicit_shape)
2372 if -1 in explicit_shape:
2373 explicit_shape[explicit_shape.index(-1)] = int(total_size / abs(np.prod(implicit_shape)))
2374
2375 return explicit_shape
2376
2377 if op.type == Op.Reshape:
2378 ifm_tensor, _, ofm_tensor = op.get_ifm_ifm2_ofm()
2379 ifm_size = ifm_tensor.elements()
2380 ofm_shape = ofm_tensor.shape
2381
2382 new_shape_tensor_shape = op.inputs[1].values.flatten() if len(op.inputs) > 1 else None
2383 new_shape_tensor_shape = _get_explicit_shape(new_shape_tensor_shape, ifm_size)
2384
2385 new_shape_attribute = op.attrs.get("new_shape", None)
2386 new_shape_attribute = _get_explicit_shape(new_shape_attribute, ifm_size)
2387
2388 # if present the new shape tensor overrides the new_shape attribute
2389 if new_shape_tensor_shape is not None:
2390 # check tensor
2391 if not np.array_equal(new_shape_tensor_shape, ofm_shape):
2392 print(
2393 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new shape tensor"
2394 f" ({new_shape_tensor_shape}) that does not match output tensor shape {ofm_shape}. Will use output"
2395 f" tensor shape."
2396 )
2397 elif new_shape_attribute is not None:
2398 # check attribute
2399 if not np.array_equal(new_shape_attribute, ofm_shape):
2400 print(
2401 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new_shape attribute"
2402 f" ({new_shape_attribute}) that does not match output tensor shape {ofm_shape}. Will use output"
2403 f" tensor shape."
2404 )
2405 else:
2406 print(
2407 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' does not have a new shape tensor or a new_shape"
2408 f" attribute. Will use output tensor shape {ofm_shape}."
2409 )
2410
2411 # force new shape tensor to output shape
2412 new_shape_tensor = create_const_tensor(
2413 op.name + "_new_shape", [len(ofm_shape)], DataType.int32, np.array(ofm_shape, np.int32)
2414 )
2415 if len(op.inputs) > 1:
2416 op.set_input_tensor(new_shape_tensor, 1)
2417 else:
2418 op.add_input_tensor(new_shape_tensor)
2419
2420 # force new_shape attribute to output shape
2421 op.attrs["new_shape"] = ofm_shape
2422
2423 return op
2424
2425
Tim Hall9cf63a32023-06-27 12:07:49 +01002426def convert_conv_groups(op: Operation, arch, nng):
2427 """
2428 Convert convolution groups to a split followed by separate convolutions and then a concat.
2429    This needs to run before the concat and split handling functions."""
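    # the rewrite is: ifm -> Split (along depth) -> one Conv2D per group (with sliced weights/bias) -> Concat -> ofm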
2430 if not op.type.is_conv2d_op():
2431 return op
2432
2433 num_conv_groups = op.attrs.get("num_conv_groups", 0)
2434 if num_conv_groups > 1:
2435 # convolution groups params
2436 ifm_depth_cg = op.ifm.shape[-1] // num_conv_groups
2437 num_filters_cg = op.weights.shape[-1] // num_conv_groups
2438
2439 # create split
2440 split_op = Operation(Op.Split, f"{op.name}_split")
2441 split_op.attrs.update(
2442 {
2443 "num_splits": num_conv_groups,
2444 }
2445 )
2446 # first input is the split axis
2447 split_op.add_input_tensor(
2448 # split along the depth axis
2449 create_const_tensor(f"{split_op.name}_axis", [0], DataType.int32, [-1])
2450 )
2451 # second input is the ifm
2452 split_op.add_input_tensor(op.ifm)
2453 # calculate shape of each ofm part
2454 split_op_ofm_shape = op.ifm.shape[:-1] + [ifm_depth_cg]
2455
2456 # create concat. do this prior to each conv group so that the for-loop can reference the concat as it iterates
2457 concat_op = Operation(Op.ConcatTFLite, f"{op.name}_concat")
2458 concat_op.attrs.update(
2459 {
2460 "axis": -1,
2461 "fused_activation_function": None,
2462 }
2463 )
2464 # calculate shape of each ifm part
2465 concat_op_ifm_shape = op.ofm.shape[:-1] + [num_filters_cg]
2466 # output is the concatenated tensor
2467 concat_op.set_output_tensor(op.ofm) # will disconnect ofm from op
2468
2469 # for each conv group
2470 for i in range(num_conv_groups):
2471 # cg params
2472 cg_oc_start = i * num_filters_cg
2473 cg_oc_end = (i + 1) * num_filters_cg
2474
2475 # split has multiple outputs
2476 split_op_ofm_part = Tensor(split_op_ofm_shape, op.ifm.dtype, f"{split_op.name}_out{i}")
2477 split_op_ofm_part.quantization = op.ifm.quantization.clone()
2478 split_op.add_output_tensor(split_op_ofm_part)
2479
2480 # concat has multiple inputs
2481 concat_op_ifm_part = Tensor(concat_op_ifm_shape, op.ifm.dtype, f"{concat_op.name}_in{i}")
2482 concat_op_ifm_part.quantization = op.ofm.quantization.clone()
2483 concat_op.add_input_tensor(concat_op_ifm_part)
2484
2485 # create convolution group operator
2486 conv_group_op = Operation(op.type, f"{op.name}_cg{i}")
2487 conv_group_op.attrs = op.attrs.copy()
2488 conv_group_op.attrs["num_conv_groups"] = 1
2489 # first input is the ifm
2490 conv_group_op.add_input_tensor(split_op_ofm_part)
2491            # second input is weights. the number of filters (i.e. the output channels) needs to be split equally
2492 # across all of the convolution groups
2493 conv_group_op_weights_shape = op.weights.shape[:-1] + [num_filters_cg]
2494 conv_group_op_weights_quant = op.weights.quantization.clone()
2495 conv_group_op_weights_quant.scale_f32 = op.weights.quantization.scale_f32[..., cg_oc_start:cg_oc_end]
2496 conv_group_op_weights_quant.zero_point = op.weights.quantization.zero_point[..., cg_oc_start:cg_oc_end]
2497 conv_group_op.add_input_tensor(
2498 create_const_tensor(
2499 f"{op.weights.name}_cg{i}",
2500 conv_group_op_weights_shape,
2501 op.weights.dtype,
2502 op.weights.values[..., cg_oc_start:cg_oc_end],
2503 op.weights.purpose,
2504 conv_group_op_weights_quant,
2505 )
2506 )
2507 # third input is bias. like the weights, the bias needs to be split equally across all of the convolution
2508 # groups
2509 if op.bias is None:
2510 conv_group_op.add_input_tensor(None)
2511 else:
2512 conv_group_op_bias_shape = op.bias.shape[:-1] + [num_filters_cg]
2513 conv_group_op_bias_quant = op.bias.quantization.clone()
2514 conv_group_op_bias_quant.scale_f32 = op.bias.quantization.scale_f32[..., cg_oc_start:cg_oc_end]
2515 conv_group_op_bias_quant.zero_point = op.bias.quantization.zero_point[..., cg_oc_start:cg_oc_end]
2516 conv_group_op.add_input_tensor(
2517 create_const_tensor(
2518 f"{op.bias.name}_cg{i}",
2519 conv_group_op_bias_shape,
2520 op.bias.dtype,
2521 op.bias.values[..., cg_oc_start:cg_oc_end],
2522 op.bias.purpose,
2523                        conv_group_op_bias_quant,
2524 )
2525 )
2526 # output goes to the concat
2527 conv_group_op.set_output_tensor(concat_op_ifm_part)
2528 # update the cg op shapes and debug db
2529 conv_group_op.set_ifm_ofm_shapes()
2530 DebugDatabase.add_optimised(op, conv_group_op)
2531
2532 # update the split/concat op shapes/debug db
2533 split_op.set_ifm_ofm_shapes()
2534 DebugDatabase.add_optimised(op, split_op)
2535 concat_op.set_ifm_ofm_shapes()
2536 DebugDatabase.add_optimised(op, concat_op)
2537
2538 # disconnect the original convolution operator.
2539 # the ofm has already been disconnected by concat_op.set_output_tensor()
2540 op.ifm.consumer_list.remove(op)
2541 op.inputs = []
2542 op.outputs = []
2543
2544 # return last op so that other graph optimiser functions can process the new operators
2545 op = concat_op
2546
2547 return op
2548
2549
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002550def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02002551 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002552 return op
2553
2554
wilisa0146c94772023-02-08 09:56:14 +00002555def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
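    """Apply the TFLite-specific graph optimisation passes to every subgraph of the network and return the network."""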
Fredrik Svedberg11563172022-07-06 14:54:12 +02002556 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00002557 optimisation_list = [
2558 optimise_quantize,
2559 convert_shape_op_to_constant_tensor,
2560 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
2561 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002562
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002563 for idx, sg in enumerate(nng.subgraphs):
2564 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002565 nng,
2566 sg,
2567 arch,
2568 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01002569 optimisation_list,
2570 rewrite_unsupported=False,
2571 )
2572
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002573 # Pre-processing step
Tim Hall9cf63a32023-06-27 12:07:49 +01002574 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes, fixup_reshape, convert_conv_groups]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002575
Ayaan Masood4965fae2022-06-29 11:30:57 +01002576 for idx, sg in enumerate(nng.subgraphs):
2577 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2578 nng,
2579 sg,
2580 arch,
2581 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002582 pre_process_list,
2583 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002584 )
2585
2586 # Handle Concat Ops
2587 for idx, sg in enumerate(nng.subgraphs):
2588 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
2589 sg.refresh_after_modification()
2590
2591 # Handle Split Ops
2592 for idx, sg in enumerate(nng.subgraphs):
2593 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2594 nng,
2595 sg,
2596 arch,
2597 [],
2598 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
2599 rewrite_unsupported=False,
2600 )
2601
2602 for idx, sg in enumerate(nng.subgraphs):
2603 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002604 nng,
2605 sg,
2606 arch,
2607 [rewrite_split_ops],
2608 [],
2609 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002610 )
2611
Johan Alfvena5e1b622023-02-02 14:59:03 +01002612 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002613 for idx, sg in enumerate(nng.subgraphs):
2614 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002615 nng,
2616 sg,
2617 arch,
2618 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01002619 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002620 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002621 )
2622
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002623 # Rewrite of operators
2624 op_rewrite_list = [
2625 set_tensor_equivalence,
Johan Alfvence502732023-04-24 13:35:40 +02002626 convert_ops_to_lut,
Rickard Bolina68b82a2023-04-20 15:12:28 +00002627 convert_mean_to_depthwise_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002628 convert_depthwise_to_conv,
2629 convert_conv_to_fc,
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02002630 convert_lstm,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002631 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02002632 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02002633 convert_mul_max_to_abs_or_lrelu,
2634 convert_lrelu,
Raul Farkas3e7157b2023-05-09 09:09:17 +01002635 convert_avg_pool_to_conv2d,
Raul Farkas69782af2023-05-09 10:39:52 +01002636 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002637 convert_hardswish_to_lut,
2638 rewrite_fully_connected_input,
2639 convert_batched_fc_shape,
2640 fixup_conv2d_backprop,
2641 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002642 reorder_depthwise_weights,
Rickard Bolin6986a072022-12-19 12:33:40 +00002643 convert_argmax_to_depthwise_conv_and_max_pool,
Tim Hall885033b2022-07-21 11:46:03 +01002644 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002645 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01002646 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002647 convert_tanh_sigmoid_to_lut,
2648 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00002649 fixup_dilation_gt2,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002650 ]
2651
2652 for idx, sg in enumerate(nng.subgraphs):
2653 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002654 nng,
2655 sg,
2656 arch,
2657 [],
2658 op_rewrite_list,
2659 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002660 )
2661
2662 for idx, sg in enumerate(nng.subgraphs):
2663        # remove passthrough tensors and attempt further optimisations
2664 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2665 nng,
2666 sg,
2667 arch,
2668 [remove_passthrough_tensor],
2669 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2670 )
2671
2672    # Removal of SplitSliceRead needs to be done after optimisation has been performed,
2673 # since ifm/ofm_shapes are of importance to this function
2674 for sg in nng.subgraphs:
2675 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2676 sg.refresh_after_modification()
2677
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002678 # Make sure that const optimisations on subgraph outputs are handled correctly
2679 for sg in nng.subgraphs:
2680 for ofm in sg.output_tensors:
2681 if ofm.is_const and ofm.ops[0].type_changed:
2682 # Subgraph output cannot be const - insert a memory copy
2683 op = ofm.ops[0]
2684 ofm_clone = ofm.clone()
2685 ofm_clone.values = ofm.values
2686 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002687 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
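                # the copy is implemented as a NOP elementwise add of zero from the cloned constant tensor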
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002688 memcpy = create_add_nop(f"{ofm.name}_copy")
2689 memcpy.add_input_tensor(ofm_clone)
2690 memcpy.add_input_tensor(zero)
2691 memcpy.set_output_tensor(ofm)
2692 memcpy.set_ifm_ofm_shapes()
2693 op.set_output_tensor(ofm_clone)
2694 DebugDatabase.add_optimised(op, memcpy)
2695
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002696 return nng