# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .api import NpuRoundingMode
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import create_avg_pool_for_concat
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .lstm import Lstm
from .numeric_util import clamp_sigmoid
from .numeric_util import full_shape
from .numeric_util import round_away_zero
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import create_cast_op
from .operation_util import create_depthwise_maxpool
from .operation_util import create_memcpy
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype

passthrough_nodes = (Op.Identity,)


def remove_passthrough_tensor(tens, arch, nng):
    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
        assert len(tens.ops[0].inputs) == 1
        tens = tens.ops[0].inputs[0]
    return tens


def rewrite_concat_ops(op, arch):
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]

        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert ofm.shape[axis] == offset

    return op
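
# For illustration (not part of the original conversion): concatenating two NHWC
# inputs of shape [1, 8, 8, 16] and [1, 8, 8, 32] along the depth axis gives write
# offsets [0, 0, 0, 0] and [0, 0, 0, 16] for the generated AvgPool copies, and the
# final offset (48) must equal the OFM depth, which is what the assert above checks.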


def rewrite_split_ops(tens, arch, nng):

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0
        if None in (offset_end, offset_start):
            read_shape = None
        else:
            # the read shape is relative to each start offset
            read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    read_shape = split_op.ofm_shapes[idx]
                    break

                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.read_shapes[0] = read_shape
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens


def remove_SplitSliceRead(op, arch):

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
        # or if an avgpool needs to be inserted
        if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
            consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
            for consumer in op.ofm.consumer_list
        ):
            # SplitSliceRead can be performed by tensor consumer(s)
            for cons_op in list(op.ofm.consumer_list):
                move_splitsliceread_to_consumer(op, cons_op)
        else:
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]
            avgpool_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)


def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt
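
# Worked example (illustrative only): for SAME padding with a dilated 3x3 kernel,
# stride 1 and a 10x10 IFM, needed_total_padding() returns 2 in each dimension, so
# padding = (1, 1, 1, 1) and skirt = (1, 1, 2 - 1, 2 - 1) = (1, 1, 1, 1), i.e. how
# far the kernel may read outside the IFM on each side.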


def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt


def fixup_conv2d_backprop(op, arch, nng):
    if op.type == Op.Conv2DBackpropInput:
        # flip the inputs
        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
        op.type = Op.Conv2DBackpropInputSwitchedBias
        op.ifm_resampling_mode = resampling_mode.TRANSPOSE

        # Update strides
        op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
        DebugDatabase.add_optimised(op, op)

    return op


# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    op.type = Op.Add  # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.name = op.name + "_add"
    # Create an input tensor filled with zeros
    name = op.inputs[1].name + "_add"
    dtype = op.inputs[0].dtype
    shape = op.ofm_shapes[0].as_list()
    values = np.zeros(shape, dtype.as_numpy_type())
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = 1.0
    quantization.zero_point = 0
    op.inputs[1] = op.inputs[0]
    op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op
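
# Rationale (informal): when the IFM is 1x1xC every output element of the resize is
# a broadcast copy of that single pixel, so the op can be expressed as an elementwise
# Add between the broadcast IFM and a zero constant of the OFM shape.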


# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
# to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]
    dw_op_attrs = {
        "padding": Padding.VALID,
        "stride_h": 1,
        "stride_w": 1,
        "strides": (1, 1, 1, 1),
        "depth_multiplier": 1,
        "channel_multiplier": 1,
        "dilation_h_factor": 1,
        "dilation_w_factor": 1,
        "dilation": (1, 1, 1, 1),
    }

    # change resizebilinear to depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(dw_op_attrs)
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0
    ofm_dtype = ofm.dtype
    if ofm_dtype.type == BaseType.UnsignedInt:
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
        weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    weight_values = [0] * (upscale_factor * upscale_factor)
    centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
    weight_values[centre_coeff] = 1

    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(
        create_const_tensor(
            "weights",
            weight_shape,
            ofm_dtype,
            np.array(weight_values).reshape(weight_shape),
            quantization=weight_quant,
        ),
        1,  # inputs tensor weight index
    )

    # setup bias tensor by assigning None and then call the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None, DataType.int32)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op
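
# For illustration: with upscale_factor = 2 the weight kernel is 2x2 and
# centre_coeff = (2 // 2) * 2 + (2 // 2) = 3, i.e. the single '1' coefficient lands
# at position D (below-and-right of the centre) in the diagram above.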


# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize op's upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    pre_op = op
    outputs = op.outputs
    dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    n = int(np.log2(upscale_factor))

    # Perform x2 upscaling n-1 times
    scaled_op = pre_op
    for count in range(n - 1):
        if count > 0:
            scaled_op = op.clone(f"_{count}")
            scaled_op.inputs[0] = pre_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_shape = upscaled_shape * 2
        shape = op.ofm_shapes[0].as_list()
        shape[1:3] = upscaled_shape
        out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
        out_tens.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    # Last x2 upscaling
    if n > 1:
        scaled_op = op.clone(f"_{n-1}")
        scaled_op.inputs[0] = pre_op.outputs[0]

    if scaled_op.original_type == Op.ResizeBilinear:
        if scaled_op.attrs["align_corners"]:
            # no padding
            scaled_op.attrs["padding"] = Padding.VALID
        else:
            # padding to the right and bottom (limits average pool to 8x8 kernel)
            scaled_op.attrs["padding"] = Padding.EXPLICIT
            scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

        # kernel size dependent on the upscaling factor
        scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
    else:  # Op.ResizeNearestNeighbor
        if scaled_op.attrs["align_corners"]:
            # use depthwise conv to select the correct value
            scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
        else:
            # Keep 1x1 kernel and average pool, this applies both when
            # half-pixel-centers is True and False. Calculations are the
            # same in the reference.
            pass

    scaled_op.outputs = outputs
    scaled_op.outputs[0].ops = [scaled_op]
    scaled_op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, scaled_op)

    return op
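
# For illustration: an x8 resize (n = 3) is carried out as three x2 nearest-neighbour
# upscaling stages; the first two use 1x1 AvgPool kernels and the final stage uses a
# kernel that depends on the original op type, e.g. an 8x8 AvgPool for ResizeBilinear
# without align_corners.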


def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
    """
    Convert ArgMax to DWConv2D->MaxPool->DWConv2D, see details below.

    Example:
    arr = [4,    [00000100,
           6, =   00000110,  # <-- This is the largest value, so we're expecting argmax(arr) = 1
           5]     00000101]

    Use 16-bit precision and shift all values 7 bits to the left:
    Shifted_arr = [0000001000000000,
                   0000001100000000,
                   0000001010000000]

    Add "c - index of channel" to each channel:
    Shifted_arr_plus_reverse_idx = [0000001000000010, (+2)
                                    0000001100000001, (+1)
                                    0000001010000000] (+0)

    The index is reversed since ArgMax selects the lowest index if the maximum value is found at two indices. The
    index will act as a tie-breaker between channels with equal values and since we want the smallest channel index
    to be chosen we reverse the index before the maxpool and then subtract the index from the number of channels
    after the maxpool to get the correct index.

    Find the maximum value in the array:
    val = max(shifted_arr_plus_reverse_idx) = 0000001100000001

    Subtract the value from the number of channels:
    shifted_arr_plus_idx = (c-1) - val = 2 - 1 = 1

    Extract the 7 lowest bits using a LUT to cut off the 9 most significant bits:
    idx = LUT(val) = 0000000000000001 = 1
    """

    if op.type == Op.ArgMax:
        ifm, ofm = op.inputs[0], op.outputs[0]
        identity_quant = QuantizationParameters()
        identity_quant.zero_point = 0
        identity_quant.scale_f32 = 1.0
        # Add last dimension to ofm shape
        ofm.shape += [1]
        ofm.ops = []

        # Create 1x1 Depthwise convolution with 2**7 weights for each channel to convert precision to 16 bit and shift
        # all values 7 bits to the left
        # Set necessary depthwise attributes
        dw_op_attrs = {
            "padding": Padding.VALID,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
            "explicit_padding": None,
        }
        orig_name = op.name
        op.name = f"{orig_name}_depthwise_conv_SHL_7"
        op.type = Op.DepthwiseConv2DBias
        op.attrs.update(dw_op_attrs)
        n, h, w, c = full_shape(4, ifm.shape, 1)
        shape = [1, 1, 1, c]
        kernel = np.dstack([2**7] * c)
        op.inputs = []
        op.add_input_tensor(ifm)
        op.add_input_tensor(
            create_const_tensor(
                "weights",
                shape,
                DataType.uint8,
                np.array(kernel).reshape(shape),
                quantization=identity_quant,
            ),
        )
        # Let the bias for each channel be the "reverse" index of the channel it is in, i.e. c - channel_idx
        reverse_idxs = list(reversed(range(c)))
        bias_tensor = create_const_tensor(op.name + "_bias", [c], DataType.int64, reverse_idxs)
        op.add_input_tensor(bias_tensor)

        intermediate_tens = Tensor([n, h, w, c], DataType.int16, "int16_and_shifted_7_bits_left")
        intermediate_tens.quantization = ifm.quantization
        op.set_output_tensor(intermediate_tens)
        op.set_ifm_ofm_shapes()
        orig_ifm_shape = op.ifm_shapes[0]
        DebugDatabase.add_optimised(op, op)

        # To extract 7 least significant bits and swap reverse index back to real index using a LUT activation, we set
        # the base value to c-1 and slope to -128. The 16-bit LUT uses a table of 32-bit values where the top 16 bits
        # represent the slope and bottom 16 bits the base which are used to interpolate the activation value.
        slope = (-128 & 0xFFFF) << 16  # Top 16 bits of 32 bit LUT table value
        base = c - 1  # Bottom 16 bits of the LUT table value
        lut_tensor = create_const_tensor(
            "maxpool_LUT_extract_7_LSB",
            [1, 1, 1, 512],
            DataType.uint32,
            [slope + base] * 512,
            TensorPurpose.LUT,
        )

        # Split large feature maps into smaller chunks since the Depthwise Maxpool height dimension can overflow due to
        # flattening the ifm to (H*W)xCx1
        max_height = 2**16 // orig_ifm_shape.width
        num_full_height_ops = orig_ifm_shape.height // max_height
        last_op_height = orig_ifm_shape.height - max_height * num_full_height_ops
        op_heights = [max_height] * num_full_height_ops
        if last_op_height > 0:
            op_heights.append(last_op_height)

        # Create maxpool output tensor which is reshaped to 1x(H*W)x1x1. The product H*W might be larger than the
        # maximum allowed height, but that's handled by reading and writing the data in chunks
        maxpool_ofm = Tensor([1, orig_ifm_shape.height * orig_ifm_shape.width, 1, 1], DataType.int16, "argmax_maxpool")
        maxpool_ofm.quantization = identity_quant

        for op_idx, op_height in enumerate(op_heights):
            maxpool_op = create_depthwise_maxpool(
                f"dw_maxpool_{op_idx}", intermediate_tens, orig_ifm_shape, identity_quant
            )
            maxpool_op.outputs = [maxpool_ofm]
            maxpool_ofm.ops.append(maxpool_op)
            maxpool_op.ofm_shapes = [Shape4D(maxpool_ofm.shape)]
            maxpool_op.set_activation_lut(lut_tensor)

            # Set read and write shapes/offsets to read/write chunks of the IFM/OFM
            maxpool_op.read_shapes[0] = Shape4D([1, op_height * orig_ifm_shape.width, orig_ifm_shape.depth, 1])
            maxpool_op.read_offsets[0] = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            maxpool_op.write_shape = Shape4D([1, op_height * orig_ifm_shape.width, 1, 1])
            maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            DebugDatabase.add_optimised(op, maxpool_op)

        # Set final shape
        maxpool_ofm.set_all_shapes([1, h, w, 1])

        # Convert 16bit to 32bit or 64bit
        if ofm.dtype == DataType.int64:
            # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
            #
            # A      -> B          -> C           -> D (OFM)
            # |0001|    |00010000|    |0001|0000|    |00010000|00000000|
            #  i16       i32           i16  i16       i32      i32
            #                                         <-------i64------>
            #
            # Memcpy is used to copy the content from B to C and from D to OFM
            # Memcpy will be turned into a nop or a DMA transfer if the memory regions differ.
            intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
        else:
            intermediate_32bit = ofm

        op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
        DebugDatabase.add_optimised(op, op_cast)

        if ofm.dtype == DataType.int64:
            # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
            intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
            memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
            DebugDatabase.add_optimised(op, memcpy_op)

            # Create int32 tensor with double ofm shape to be able to store a "int64" result
            intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")

            op_cast = create_cast_op(
                f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
            )
            DebugDatabase.add_optimised(op, op_cast)

            memcpy_op = create_memcpy(f"{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
            DebugDatabase.add_optimised(op, memcpy_op)

    return op


def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
    def _compute_interpolation_values(index, input_size, output_size):
        scale = input_size / output_size
        scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
        lower_bound = max(np.floor(scaled_value), 0)

        return scaled_value, lower_bound

    def _compute_kernels(input_height, input_width, output_height, output_width):
        kernels = []
        for y in (1, 2):
            for x in (1, 2):
                sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
                sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)

                # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
                # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
                # top-to-bottom - same as the depthwise convolution strides across each tile
                kernel = np.zeros((2, 2))
                kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
                kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
                kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
                kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
                kernel *= 16
                kernels.append(kernel)

        return kernels

    def _build_convolutions(op, kernels):
        dw_op_attrs = {
            "padding": Padding.TILE,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
        ifm = op.ifm
        ofm = op.ofm
        ofm.ops = []
        elem_size = 2 if ofm.dtype == DataType.int16 else 1

        n, h, w, c = ifm.shape
        _, _, ow, _ = ofm.shape

        intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
        intermediate_tens.quantization = op.outputs[0].quantization.clone()
        avgpool_op = op
        avgpool_op.name = "rb_init_avgpool"
        avgpool_op.type = Op.AvgPool
        avgpool_op.attrs["padding"] = Padding.VALID
        avgpool_op.attrs["stride_w"] = 1
        avgpool_op.attrs["stride_h"] = 1
        avgpool_op.attrs["filter_width"] = 1
        avgpool_op.attrs["filter_height"] = 1
        avgpool_op.attrs["strides"] = [1, 1, 1, 1]
        avgpool_op.attrs["ksize"] = [1, 1, 1, 1]

        avgpool_op.add_input_tensor(ifm)
        avgpool_op.set_output_tensor(intermediate_tens)
        avgpool_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, op)

        dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
        dw_conv._original_type = Op.ResizeBilinear
        dw_conv.write_shape = Shape4D(n, h, w, c)
        dw_conv.write_offset = Shape4D(0, 0, 0, 0)

        # Set the output rounding mode. Resize bilinear requires rounding away from zero. Therefore, we need to
        # adjust the accumulated value by a "small" amount before applying natural rounding. The "small" amount
        # should be big enough to cause a x.5 to be rounded correctly but small enough not to cause smaller
        # values to be incorrectly rounded
        ofm.quantization.next_after = True
        dw_conv.rounding_mode = NpuRoundingMode.NATURAL

        # Double height and width stride to write the output of each of the four depthwise convolutions below
        # interleaved with each other when combined with OFM tile base offsets.
        dw_conv.ofm_stride_multiplier = [1, 2, 2]  # C/H/W

        # Choose tile padding direction - pad by 1 with edge values in two directions.
        # For example, TL (top left) will pad top and left in H/W-plane in all channels.
        directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]]  # TL, TR, BL, BR
        for i in (0, 1):
            for j in (0, 1):
                index = i * 2 + j
                dw_conv.name = f"depthwise_conv_{index}"
                dw_op_attrs["explicit_padding"] = directions[index]
                dw_conv.attrs.update(dw_op_attrs)

                # This will offset the start of the write by modifying the Tile 0 base address
                dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size

                ofm.ops.append(dw_conv)
                dw_conv.outputs = [ofm]

                kernel = kernels[index]
                shape = [2, 2, 1, c]
                kernel = np.dstack([kernel] * c)

                quant = QuantizationParameters()
                quant.zero_point = 0
                quant.scale_f32 = 1.0 / 16

                dw_conv.inputs = []
                dw_conv.add_input_tensor(intermediate_tens)
                dw_conv.add_input_tensor(
                    create_const_tensor(
                        "weights",
                        shape,
                        intermediate_tens.dtype,
                        np.array(kernel).reshape(shape),
                        quantization=quant,
                    ),
                )

                # setup bias tensor by assigning None and then call the fix-up function to create a suitable tensor.
                # need to append the bias tensor as resize ops only have 2 inputs
                assert len(dw_conv.inputs) == 2
                dw_conv.inputs.append(None)
                fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)

                dw_conv.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, dw_conv)

                dw_conv = dw_conv.clone(f"_{index}")
        return op

    _, input_height, input_width, _ = op.ifm.shape
    _, output_height, output_width, _ = op.ofm.shape

    kernels = _compute_kernels(input_height, input_width, output_height, output_width)
    op = _build_convolutions(op, kernels)

    return op
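
# For illustration: for an x2 upscale with half_pixel_centers=True the fractional
# offsets are 0.25 and 0.75, so each 2x2 kernel holds the weights {9, 3, 3, 1} (in
# some arrangement); the x16 factor applied above is compensated by the weight scale
# of 1/16.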


def fixup_resize(op, arch, nng):
    if op.type.is_resize_op() and op.run_on_npu:
        if op.ifm_shapes[0] == op.ofm_shapes[0]:
            # Bypass the resize op which is essentially a NOP
            op.inputs = op.inputs[:1]
            op.type = Op.Identity
        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
            convert_resize_1x1_to_add(op)
        elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
            convert_resizebilinear_to_depthwise_convolutions(op)
        else:
            convert_resize_to_upscale_and_average_pool(op)

    return op


def convert_nop_split_to_identity(op, arch, nng):
    if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it shouldn't, remove_passthrough_tensor will fail appropriately
        op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
        op.type = Op.Identity
    return op


def rewrite_fully_connected_input(op: Operation, arch, nng):
    # If the operation already has a read shape do not modify
    # the ifm shape, since that will already be correct
    if op.type == Op.FullyConnected and not op.read_shapes[0]:
        new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
        assert new_shape is not None, "Tensor can not be reshaped to 2D"
        op.ifm_shapes[0] = new_shape

        if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
            # If IFM is batching then also make sure OFM is batching
            h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
            op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])

    return op


def convert_batched_fc_shape(op, arch, nng):
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op
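
# For illustration: a FullyConnected IFM with batch 8 is re-expressed as the 4D shape
# [1, 2, 4, depth] using batching_split, so the batch dimension maps onto the H/W
# plane the NPU operates on; batch sizes not in the table fall back to [1, 1, n, depth].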


def unfuse_activation_function(op):
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, act_op)


def rewrite_stridedslice_output(op, arch, nng):
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op
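
# For illustration: prev_mask - (mask & (mask - 1)) isolates the lowest set bit, so
# shrink_axis_mask = 0b0100 resolves to axis 2; the loop above then re-inserts a 1 at
# that axis in the output shape so the op can be handled with an explicit 4D axis
# (split_axis_4D).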


def rewrite_unpack_output(op, arch, nng):
    tens = op.outputs[0]
    if op.run_on_npu and op.type == Op.Unpack:
        # Unpack is also referred to as Unstack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis
        op.type = Op.UnpackReshaped
        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

        axis_4D = axis + (4 - len(desired_output_shape))
        op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

        for idx, out_tens in enumerate(op.outputs):
            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op


def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"],
                    op.kernel,
                    input_shape,
                    op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


def reorder_depthwise_weights(op, arch, nng):
    if op.type.is_depthwise_conv2d_op():
        weight_tensor = op.inputs[1]
        weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
        weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        weight_tensor.weight_transpose_depthwise = True

    return op


def fixup_strided_conv(op: Operation, arch, nng):
    """Optimize or fixup strided Conv2DBias
    Optimization:
        Reduce, when possible, the Conv2DBias stride from 2 to 1 by re-shaping
        both IFM and filter.

    Fixup:
        Introduce software support for Conv2DBias with stride_width = 4 by
        reducing it to 1 when possible by re-shaping both IFM and filter.
    """
    if op.type != Op.Conv2DBias:
        return op
    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]
    if (
        (stride_x == 2 or stride_x == 4)
        and ifm_shape.depth <= 4
        and ifm_shape.width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    ):
        k_w, _ = op.get_kernel_size()
        curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
        optimised_padding_x = needed_total_padding(ifm_shape.width // stride_x, 1, (k_w + 1) // stride_x)
        padding_type = op.attrs.get("padding", None)

        # If padding is enabled, check if current padding matches optimised padding
        if not padding_type or (padding_type != Padding.VALID and curr_padding_x != optimised_padding_x):
            # Horizontal padding would become different after optimisation; this would not work
            return op
        # IFM
        op.ifm_shapes[0] = Shape4D(
            [ifm_shape.batch, ifm_shape.height, ifm_shape.width // stride_x, ifm_shape.depth * stride_x]
        )

        # Weights
        weight_shape = weight_tensor.shape
        if weight_shape[1] % 2 != 0:
            weight_shape[1] = weight_shape[1] + 1
            padded_array = np.zeros(weight_shape)
            for i in range(weight_shape[0]):
                padded_array[i] = np.vstack(
                    [
                        weight_tensor.values[i],
                        np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
                    ]
                )
            weight_tensor.values = padded_array

        # Change weight shape based on stride_x
        weight_shape[1] //= stride_x
        weight_shape[2] *= stride_x

        weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides
        stride_x = 1
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

        op.ifm.force_linear_format = True
    return op
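
# For illustration: a stride-2 Conv2D with IFM [1, 10, 16, 3] is rewritten to use an
# IFM view of [1, 10, 8, 6] (width halved, depth doubled) and a weight tensor whose
# kernel width is halved and input-channel count doubled, after which the horizontal
# stride can be set to 1.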


def convert_conv_to_fc(op, arch, nng):
    # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
    if op.type == Op.Conv2DBias:
        h = op.ifm_shapes[0].height
        w = op.ifm_shapes[0].width
        kh, kw, _, _ = op.inputs[1].shape
        if h == 1 and w == 1 and kh == 1 and kw == 1:
            # Overwrite this op as a Fully Connected Op
            op.name += "_fc"
            op.type = Op.FullyConnected
            op.attrs = {
                "weights_format": 0,
            }
            # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
            weight_tensor = op.inputs[1]
            weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
    return op


def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Add explicit rescaling
            rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
            multiplier, shift = scaling.quantise_scale(rescale)
            relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            op = relu_fused_op
    return op


def convert_lstm(op, arch, nng):
    if op.type == Op.UnidirectionalSequenceLstm:
        lstm = Lstm(op)
        op = lstm.get_graph()
    return op


def convert_softmax(op, arch, nng):
    if op.type == Op.Softmax and op.run_on_npu:
        softmax = SoftMax(op)
        op = softmax.get_graph()
    return op


def convert_prelu(op, arch, nng):
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If const alpha check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu
            alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
                # Multiply with alpha tensor
                mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
                mul_alpha.add_input_tensor(ifm)
                mul_alpha.add_input_tensor(alpha)
                fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
                mul_alpha.set_output_tensor(fm_alpha)
                mul_alpha.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, mul_alpha)
                if check_quantized_tens_scaling_equal(ifm, ofm):
                    # No scaling is needed
                    fm_id = ifm
                else:
                    # Add multiplication with identity
                    mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
                    mul_identity.add_input_tensor(ifm)
                    # Create const tensor containing identity as scalar
                    quantization = ifm.quantization.clone()
                    quantization.scale_f32 = np.float32(1)
                    quantization.zero_point = 0
                    one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
                    mul_identity.add_input_tensor(one)
                    # Make sure that fm_id is allocated to a different address than fm_alpha
                    fm_id = ofm.clone(op.name + "_id", set_unique=True)
                    mul_identity.set_output_tensor(fm_id)
                    mul_identity.set_ifm_ofm_shapes()
                    DebugDatabase.add_optimised(op, mul_identity)

                # Combine scaled and alpha multiplied values
                max_op = Operation(Op.Maximum, op.name + "_max")
                max_op.add_input_tensor(fm_alpha)
                max_op.add_input_tensor(fm_id)
                max_op.set_output_tensor(ofm)
                max_op.set_ifm_ofm_shapes()

                DebugDatabase.add_optimised(op, max_op)
                ifm.consumer_list.remove(op)
                return max_op

        # Catch all PReLU conversion for the cases that could not be optimised above
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
        min_op.set_output_tensor(fm_negative)
        min_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, min_op)

        # and multiply with alpha tensor
        mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
        mul_alpha.add_input_tensor(fm_negative)
        mul_alpha.add_input_tensor(alpha)
        fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
        mul_alpha.set_output_tensor(fm_alpha)
        mul_alpha.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_alpha)

        # Select (and scale) values > 0
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_scaled)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)

        # Add scaled and alpha multiplied values (without scaling)
        add_op = Operation(Op.Add, op.name + "_add")
        add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
        add_op.add_input_tensor(fm_alpha)
        add_op.add_input_tensor(fm_scaled)
        add_op.set_output_tensor(ofm)
        add_op.set_ifm_ofm_shapes()

        DebugDatabase.add_optimised(op, add_op)
        ifm.consumer_list.remove(op)
        op = add_op

    return op


def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X          For X = -1 or X > 0
    |   \   /           This subgraph can be replaced with either
    |    Mul            an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max
    """

    if op.type == Op.Maximum:
        # finds the Mul input(s) to the Max
        muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
        if len(muls) == 1:
            mul = muls[0].ops[0]
        elif len(muls) == 2:
            # In the case both inputs are Muls, find the one with the same input as the Max
            mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
            if len(mul_ifms):
                mul = mul_ifms[0].ops[0]
            else:
                # Not using same input
                return op
        else:
            # No Mul inputs
            return op

        # make sure the Mul doesn't have any other consumers
        mul_ofm = mul.outputs[0]
        if len(mul_ofm.consumers()) != 1:
            return op
        # make sure the Mul doesn't have a fused activation function
        if mul.activation:
            return op
        ifm, ofm = op.get_ifm_ofm()
        if ifm is None or ofm is None:
            return op

        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
            return op
        if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
            return op

        # finds the branched input that goes to both the Max and the Mul
        shared = set(op.inputs) & set(mul.inputs)
        if len(shared) == 1:
            shared_in = shared.pop()
            # find the constant scalar input to the Mul
            const_tens = (set(mul.inputs) - {shared_in}).pop()
            # check that it is a scalar
            if const_tens.shape != []:
                return op
            const = const_tens.ops[0]
            # check that it is a constant
            if const.type != Op.Const:
                return op
            # Remove the Mul from the shared input's consumers
            shared_in.consumer_list.remove(mul)
        else:
            return op

        val = const.outputs[0].values
        if val >= 0:
            new_op = Op.LeakyRelu
            op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scale", to be used as input
            # to the LUT construction
            alpha_scalar = const_tens.values - const_tens.quantization.zero_point
            mul_ifm_scale = np.double(ifm.quantization.scale_f32)
            mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
            mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
            alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
            op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
        elif val == -1:
            new_op = Op.Abs
        else:
            return op

        op.type = new_op
        op.name = op.name.replace("Maximum", new_op.name)
        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
        op.inputs = [shared_in]
        op.set_ifm_ofm_shapes()

        # Record optimisation in debug database
        DebugDatabase.add_optimised(op, op)

    return op
1288
1289
1290def convert_hardswish_to_lut(op, arch, nng):
1291 if op.type == Op.HardSwish:
1292 ifm, ofm = op.get_ifm_ofm()
1293 # Generate the LUT
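        # hard_swish(x) = x * relu6(x + 3) / 6; the loop below evaluates this in 16-bit fixed point
        # for every possible quantized input value and collects the results into a 256-entry LUT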
1294 ifm_scale = np.double(ifm.quantization.scale_f32)
1295 ofm_scale = np.double(ofm.quantization.scale_f32)
1296 zp_in = ifm.quantization.zero_point
1297 zp_out = ofm.quantization.zero_point
1298 ifm_scale_hires = (1 / 128) * ifm_scale
1299 relu_multiplier = np.double(3 / 32768)
1300 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1301 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1302 # Use 16bit scale
1303 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1304 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1305
1306 values = []
1307 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1308 quantized_min = min(ix)
1309 quantized_max = max(ix)
1310 for x in ix:
1311 input_value = x - zp_in
1312 input_value_hires = input_value * 128
1313 # Compute the input value on essentially the output scale, not shifted yet
1314 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1315             # Compute the "relu-ish multiplier". This matches the code in the TensorFlow Lite Micro kernel
1316 relu_value = np.int16(input_value_hires)
1317 if relu_shift < 31:
1318 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1319
1320 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1321
1322 if relu_shift < 31:
1323 relu_value = fp_math.shift_left16(relu_value, 1)
1324
1325 if relu_shift > 31:
1326 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1327
1328             # The value has now been rescaled into a 16bit fixedpoint relu_value in [-1, 1]
1329 # Now convert that to a 16bit fixedpoint value in [0, 1]
1330 relu_value = (relu_value + (1 << 15)) >> 1
1331 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1332 shift = 31 - out_shift
1333 shift = -shift if shift < 0 else 0
1334 # Finally apply the output shift
1335 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1336 lut_result = min(quantized_max, max(quantized_min, lut_result))
1337 values.append(lut_result)
1338 return convert_to_lut(op, values, "hardswish")
1339 return op
1340
1341
1342def convert_lrelu_to_mul_max(op, arch):
1343 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1344 # (the opposite of convert_mul_max_to_abs_or_lrelu)
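    # For 0 < alpha < 1 the result is Maximum(alpha * IFM, identity * IFM). For alpha outside that
    # range the negative part (Minimum with zero) and the positive part (Relu) are scaled separately
    # and then combined with an Add instead of a Maximum.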
1345 ifm, ofm = op.get_ifm_ofm()
1346 if ifm is None or ofm is None:
1347 return op
1348
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001349 alpha = np.float32(op.attrs["alpha"])
1350 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001351 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001352 if use_mul_max:
1353 mul_ifm = ifm
1354 new_op = Op.Maximum
1355 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001356 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001357 no_scale_quant = ifm.quantization.clone()
1358 no_scale_quant.scale_f32 = None
1359 no_scale_quant.zero_point = 0
1360 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1361
1362 # Select values < 0
1363 min_op = Operation(Op.Minimum, op.name + "_min")
1364 min_op.add_input_tensor(ifm)
1365 min_op.add_input_tensor(zero)
1366 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001367 if alpha < 0 and not is_converted_prelu:
1368 # For negative alpha that is not from a converted PReLU we need to use
1369 # int32 Mul below to perform the (negative) alpha scaling
1370 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001371 min_op.set_output_tensor(mul_ifm)
1372 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001373 new_op = Op.Add
1374 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001375 DebugDatabase.add_optimised(op, min_op)
1376
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001377 # Add multiplication with alpha
1378 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001379 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001380 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001381 quantization = ifm.quantization.clone()
1382 quantization.min = 0
1383 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1384 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001385 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001386 if is_converted_prelu:
1387 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001388 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001389 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001390 elif alpha == 0 or np.isinf(1 / alpha):
1391 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001392 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001393 scalar = 0
1394 else:
1395 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001396 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001397 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001398 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1399 else:
1400 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001401 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001402 mul_alpha.add_input_tensor(alpha_tens)
1403 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1404 mul_alpha.set_output_tensor(fm_alpha)
1405 mul_alpha.set_ifm_ofm_shapes()
1406 DebugDatabase.add_optimised(op, mul_alpha)
1407
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001408 if not use_mul_max:
1409 relu_op = Operation(Op.Relu, op.name + "_relu")
1410 relu_op.add_input_tensor(ifm)
1411 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1412 relu_op.set_output_tensor(fm_id)
1413 relu_op.set_ifm_ofm_shapes()
1414 DebugDatabase.add_optimised(op, relu_op)
1415 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001416 # No identity multiplication is needed
1417 fm_id = ifm
1418 else:
1419 # Add multiplication with identity
1420 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1421 mul_identity.add_input_tensor(ifm)
1422 # Create const tensor containing identity as scalar
1423 quantization = ifm.quantization.clone()
1424 quantization.min = 0
1425 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001426 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001427 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001428 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001429 mul_identity.add_input_tensor(identity_tens)
1430 # Make sure that fm_id is allocated to a different address than fm_alpha
1431 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1432 mul_identity.set_output_tensor(fm_id)
1433 mul_identity.set_ifm_ofm_shapes()
1434 DebugDatabase.add_optimised(op, mul_identity)
1435
1436 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001437 op.type = new_op
1438 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001439 op.inputs = []
1440 ifm.consumer_list.remove(op)
1441 op.add_input_tensor(fm_alpha)
1442 op.add_input_tensor(fm_id)
1443 op.set_ifm_ofm_shapes()
1444
1445 DebugDatabase.add_optimised(op, op)
1446 return op
1447
1448
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001449def convert_to_lut8(op, fn, fn_name):
1450 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1451 # fn is a function(real) -> real
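    # Each LUT entry is: clamp(round_away_zero(zp_out + fn(ifm_scale * (x - zp_in)) / ofm_scale))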
1452 ifm, ofm = op.get_ifm_ofm()
1453 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1454 return op
1455 # Generate the LUT
1456 ifm_scale = np.double(ifm.quantization.scale_f32)
1457 ofm_scale = np.double(ofm.quantization.scale_f32)
1458 zp_in = ifm.quantization.zero_point
1459 zp_out = ofm.quantization.zero_point
1460 values = []
1461 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1462 quantized_min = min(ix)
1463 quantized_max = max(ix)
1464 for x in ix:
1465 x_real = ifm_scale * (x - zp_in)
1466 y_real = fn(x_real)
1467 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1468 lut_result = min(quantized_max, max(quantized_min, lut_result))
1469 values.append(lut_result)
1470 return convert_to_lut(op, values, fn_name)
1471
1472
1473def convert_lrelu_to_lut(op, arch):
1474 ifm, ofm = op.get_ifm_ofm()
1475 # Generate the LUT
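    # Inputs below the input zero point map through the alpha scaling, inputs at or above it map
    # through the identity scaling, giving LeakyRelu directly on the quantized values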
1476 alpha = op.attrs["alpha"]
1477 ifm_scale = np.double(ifm.quantization.scale_f32)
1478 ofm_scale = np.double(ofm.quantization.scale_f32)
1479 zp_in = ifm.quantization.zero_point
1480 zp_out = ofm.quantization.zero_point
1481 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1482 alpha_scalar = 1
1483 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1484 if "alpha_scaling" in op.attrs:
1485 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
1486 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1487 values = []
1488 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1489 quantized_min = min(ix)
1490 quantized_max = max(ix)
1491 for x in ix:
1492 if x < zp_in:
1493 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1494 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1495 )
1496 else:
1497 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1498 lut_result = min(quantized_max, max(quantized_min, lut_result))
1499 values.append(lut_result)
1500 return convert_to_lut(op, values, "lrelu")
1501
1502
1503def convert_lrelu(op, arch, nng):
1504 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
1505 if op.type != Op.LeakyRelu:
1506 return op
1507 ifm, ofm = op.get_ifm_ofm()
1508 if ifm is None or ofm is None:
1509 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001510 alpha = op.attrs["alpha"]
1511 if alpha == 0:
1512         # When alpha is 0 the operation can be converted to a ReLU
1513 op.type = Op.Relu
1514 op.name = op.name.replace("LeakyRelu", op.type.name)
1515 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001516 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1517 # use LUT for int8/uint8
1518 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001519 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001520 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001521 return op
1522 return convert_lrelu_to_mul_max(op, arch)
1523
1524
1525def convert_tanh_sigmoid_to_lut(op, arch, nng):
1526 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
1527 if op.type == Op.Sigmoid:
1528 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1529 elif op.type == Op.Tanh:
1530 return convert_to_lut8(op, math.tanh, "tanh")
1531 return op
1532
1533
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001534def fuse_activation_function_with_prev(op, arch, nng):
1535 # if op is a no-op: attempts to move the activation function to the preceding op
1536 if not op.attrs.get("is_nop", False) or op.activation is None:
1537 return op
1538 ifm, ofm = op.get_ifm_ofm()
1539 if ifm is None or ofm is None:
1540 return op
1541 # finds the input(s) to the operation
1542 prev_op = ifm.ops[0]
1543 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1544 fuse = (
1545 prev_op.run_on_npu
1546 and prev_op.type.npu_block_type != NpuBlockType.Default
1547 and len(ifm.ops) == 1
1548 and len(prev_op.outputs[0].consumers()) == 1
1549 and prev_op.activation is None
1550 )
1551 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1552 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1553 # LUT currently only works correctly for elementwise ops
1554 fuse = False
1555 if not fuse:
1556 return op
1557 # Move the fused activation function + corresponding info to prev_op
1558 prev_op.activation = op.activation
1559 prev_op.forced_output_quantization = op.forced_output_quantization
1560 if op.activation_lut is not None:
1561 prev_op.set_activation_lut(op.activation_lut)
1562 # Bypass op
1563 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001564 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001565 return op
1566
1567
1568def _leading_pad_ok(leading_pad, stride, kernel_size):
1569 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1570 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
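    # Example: kernel 7 (max_size 3) and stride 2: a leading pad of 3, 2 or 0 is accepted, while 1 is not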
1571 max_size = kernel_size // 2
1572 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1573
1574
1575def replace_pad_by_hw_pad(op: Operation, arch, nng):
1576 """
1577 Tries to completely remove a PAD operator by using hardware padding.
1578 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1579 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1580 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1581 if both operations can be run on the NPU.
1582 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1583 """
1584 if (
1585 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001586 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001587 and op.run_on_npu
1588 and op.attrs["padding"] == Padding.VALID
1589 ):
1590 pad_op = op.ifm.ops[0]
1591 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1592 return op
1593 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1594 return op
1595 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1596 k = op.kernel
1597 k_w, k_h = k.dilated_wh()
1598
1599 # Check if the PAD operator can be replaced by hardware padding
1600 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1601 # Too much padding, it would require hardware padding to actually insert zeros
1602 return op
1603 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1604 return op
1605
1606 if op.type.is_avgpool_op():
1607 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1608 for pad, k_size in (
1609 (left, k_w),
1610 (right, k_w),
1611 (top, k_h),
1612 (bottom, k_h),
1613 ):
1614 if pad not in (0, k_size // 2):
1615 return op
1616 # Average pool is converted to depthwise, because NPU average pool + same padding
1617 # has a special implementation that is different from PAD followed by average pool with
1618 # valid padding.
1619 k_w, k_h = op.kernel.width, op.kernel.height
1620 ifm = op.ifm
1621 # Remember other inputs
1622 other_inputs = op.inputs[1:]
1623 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1624 quantization = QuantizationParameters(0.0, 255.0)
1625 quantization.scale_f32 = 1.0 / (k_w * k_h)
1626 quantization.zero_point = 0
1627 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1628 weights = np.full(shape, 1)
1629
1630 weight_tens = create_const_tensor(
1631 op.name + "_weights",
1632 shape,
1633 op.ifm.dtype,
1634 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001635 purpose=TensorPurpose.Weights,
1636 quantization=quantization,
1637 )
James Peet7519d502021-07-19 16:47:58 +01001638 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001639 op.type = Op.DepthwiseConv2DBias
1640 op.inputs = []
1641 op.add_input_tensor(ifm)
1642 op.add_input_tensor(weight_tens)
1643 # Add bias tensor, all biases set to 0
1644 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001645 fixup_bias_tensors(op, arch, nng, DataType.int32)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001646 # Add other inputs
1647 op.inputs.extend(other_inputs)
1648 op.rounding_mode = NpuRoundingMode.NATURAL
1649
1650 # Bypass the PAD operator
1651 op.set_input_tensor(pad_op.ifm, 0)
1652 # Adjust the padding attributes of the convolution operator
1653 op.attrs["padding"] = Padding.EXPLICIT
1654 op.attrs["explicit_padding"] = (top, left, bottom, right)
1655 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001656 DebugDatabase.add_optimised(op, op)
1657
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001658 return op
1659
1660
1661def convert_pad(op: Operation, arch, nng):
1662 """
1663 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1664 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1665     This is done as a fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1666 """
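    # The op itself becomes a ConcatTFLite and each region of the OFM is written by an average pool
    # (create_avg_pool_for_concat); the border regions are filled with the OFM zero point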
1667 if op.type != Op.Pad or not op.run_on_npu:
1668 return op
1669 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1670
1671 ifm = op.ifm
1672 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001673 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001674 ofm = op.ofm
1675 assert ofm is not None
1676 ofm.ops = []
1677 ofm_shape = op.ofm_shapes[0]
1678
1679 # Average pool op that copies IFM to the right place inside the OFM
1680 shp0 = Shape4D(0, 0, 0, 0)
1681 shp_top = shp0.with_height(top)
1682 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1683 avgpool_op.activation = op.activation
1684 quant = ofm.quantization
1685 pad_value = quant.zero_point
1686 # Add operations that fill the borders of the OFM
1687 if top > 0:
1688 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1689 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001690 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001691 )
1692 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1693 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1694 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1695 if bottom > 0:
1696 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1697 zero_tens = create_const_tensor(
1698 op.name + "_bottom",
1699 shape.as_list(),
1700 ofm.dtype,
1701 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001702 quantization=quant,
1703 )
1704 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1705 create_avg_pool_for_concat(
1706 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1707 )
1708 if left > 0:
1709 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1710 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001711 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001712 )
1713 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1714 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1715 if right > 0:
1716 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1717 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001718 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001719 )
1720 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1721 create_avg_pool_for_concat(
1722 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1723 )
1724
1725 op.type = Op.ConcatTFLite
1726 return avgpool_op
1727
1728
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001729def fixup_bias_tensors(op, arch, nng, dtype=None):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001730 if op.type.needs_bias() and op.bias is None:
1731 # Op has no bias, add bias tensor filled with zeros
1732 nr_biases = op.inputs[1].shape[-1]
1733 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001734 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1735 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1736 # For int16 the selected bias DataType will have an impact on the scaling
1737 # used when encoding the scales and biases later. The default mode will match the
1738         # reference with reduced scaling for int64 bias.
1739 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1740         # is used to emulate average pool, int32 bias should be selected for full precision
1741 # int16 scaling.
1742 if dtype is None:
1743 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1744 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001745 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1746
1747 return op
1748
1749
wilisa0146c94772023-02-08 09:56:14 +00001750def detect_asymmetric_weights(op):
1751 # Check all ops (cpu and npu)
1752 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1753 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001754 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001755 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1756 return True
1757 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001758
wilisa0146c94772023-02-08 09:56:14 +00001759
1760def fixup_asymmetric_weights(op, arch, nng):
1761 if detect_asymmetric_weights(op):
1762 if op.run_on_npu:
1763 print("Zero points have been adjusted.")
1764 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001765 return op
1766
1767
wilisa0146c94772023-02-08 09:56:14 +00001768def check_asymmetric_weights(op, arch, nng):
1769 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1770 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1771    # possibility of other graph optimiser functions modifying the operator (that is later run on the CPU)
1772 if detect_asymmetric_weights(op):
1773 if op.run_on_npu:
1774 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1775 op.run_on_npu = False
1776 return op
1777
1778
1779def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
1780 if force_symmetric_int_weights:
1781 return fixup_asymmetric_weights
1782 else:
1783 return check_asymmetric_weights
1784
1785
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001786def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1787 if op.type == Op.Mean and op.run_on_npu:
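        # The mean over the height/width axes is emulated either by an AvgPool with truncating
        # rounding (when input and output scaling are equal) or by a DepthwiseConv2DBias with unit
        # weights scaled by 1/(h*w) and a bias of -zero_point*h*w to compensate for the input zero point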
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001788 inp, axis = op.inputs
1789 shape = inp.shape
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001790 ofm_shape = op.ofm.shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001791 dims = len(shape)
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001792 dims_ofm = len(ofm_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001793
1794         # Height and width axes have different indices depending on the number of dimensions
1795 if axis.shape == [] or axis.shape[0] == 1: # single axis
1796 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1797 if dims in (2, 3):
1798 if axis == 0:
1799 h, w = shape[axis], 1
1800 else:
1801 h, w = 1, shape[axis]
1802 else:
1803 if axis == 1:
1804 h, w = shape[axis], 1
1805 else:
1806 h, w = 1, shape[axis]
1807 else: # multiple axes
1808 axis = sorted(axis.values)
1809 h, w = [shape[i] for i in axis]
1810
1811 # Set necessary depthwise attributes
1812 op.attrs.update(
1813 {
1814 "padding": Padding.VALID,
1815 "stride_h": 1,
1816 "stride_w": 1,
1817 "strides": (1, 1, 1, 1),
1818 "depth_multiplier": 1,
1819 "channel_multiplier": 1,
1820 "dilation_h_factor": 1,
1821 "dilation_w_factor": 1,
1822 "dilation": (1, 1, 1, 1),
1823 }
1824 )
1825 # Change op type
1826 op.type = Op.DepthwiseConv2DBias
1827 # Set IFM/OFM shapes after changing op type
1828 op.set_ifm_ofm_shapes()
1829
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001830 weight_scale, bias = 1, 0
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001831 ofmq, ifmq = op.ofm.quantization, inp.quantization
Johan Alfvén9d51ec42022-10-27 16:30:01 +02001832 if ifmq.is_scaling_equal(ofmq):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001833 # Here we can just use a simple AvgPool with truncating rounding,
1834 # as we're emulating simple integer division.
1835 op.rounding_mode = NpuRoundingMode.TRUNCATE
1836 op.type = Op.AvgPool
1837 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1838 else:
1839 op.rounding_mode = NpuRoundingMode.NATURAL
1840 weight_scale = 1 / (h * w)
1841 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1842 bias = -ifmq.zero_point * h * w
1843 fiq = ifmq.clone()
1844 fiq.zero_point = 0
1845 op.forced_input_quantization = fiq
1846
1847 # Change dimensions to 4
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001848 def extend_dims(dim, in_shape):
1849 if dim < 4:
1850 in_shape = [1] + in_shape
1851 if dim == 2:
1852 in_shape += [1]
1853 return in_shape
1854
1855 if dims < 4 or dims_ofm < 4:
1856 # Fix the ofm dimension when keep_dims is false
1857 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
1858 if isinstance(axis, int) and dims_ofm + 1 == dims:
1859 ofm_shape.insert(axis, 1)
1860 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
1861 for i in axis:
1862 ofm_shape.insert(i, 1)
1863 shape = extend_dims(dims, shape)
1864 dims_ofm = len(ofm_shape)
1865 ofm_shape = extend_dims(dims_ofm, ofm_shape)
1866 op.set_ifm_ofm_shapes()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001867
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001868 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001869 weight_shape = None
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001870 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001871         # This can only happen, and is only handled, when reducing over multiple axes, and
1872 # h * w <= 256 for DepthwiseConv2DBias
1873 # h * w <= 4096 for AvgPool
1874 # which is checked in supported ops
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001875 shape = [shape[0], 1, h * w, shape[3]]
1876 op.ifm_shapes[0] = Shape4D(shape)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001877 weight_shape = [1, h * w, shape[3], shape[0]]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001878 if h > 256 and op.type == Op.AvgPool:
1879 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1880
1881 # If the AvgPool version is used, we don't need to do anything else
1882 if op.type == Op.AvgPool:
wilisa0179a89042022-11-02 17:18:43 +00001883 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001884 return op
1885
1886 # Make unit weight tensor quantization
1887 weight_quant = ifmq.clone()
1888 weight_quant.min = 0
1889 weight_quant.max = 255
1890 weight_quant.scale_f32 = weight_scale
1891 weight_quant.zero_point = 0
1892
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001893 if weight_shape is None:
1894 # Set weight shape to [H,W,C,B]
1895 weight_shape = [h, w, shape[3], shape[0]]
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001896
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001897 # Add unit weight tensor
1898 op.set_input_tensor(
1899 create_const_tensor(
1900 "weights",
1901 weight_shape,
1902 inp.dtype,
1903 np.ones(weight_shape),
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001904 quantization=weight_quant,
1905 ),
1906 1,
1907 )
James Peet7519d502021-07-19 16:47:58 +01001908 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001909
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001910 # Add bias tensor
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001911 bias_shape = [shape[-1]]
1912 op.inputs.append(create_const_tensor("bias", bias_shape, DataType.int32, np.ones(bias_shape) * bias))
wilisa0179a89042022-11-02 17:18:43 +00001913 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001914
1915 return op
1916
1917
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001918def optimise_quantize(op: Operation, arch, nng):
1919
1920 if op.type == Op.Quantize and op.run_on_npu:
1921
1922 ifm, ofm = op.get_ifm_ofm()
1923 input_values = ifm.values
1924
1925 # Guard clause - input not const or no values to quantize
1926 if ifm.ops[0].type != Op.Const or input_values is None:
1927 return op
1928
1929 # Singular val in numpy array, convert to indexable array
1930 if input_values.ndim == 0:
1931 input_values = np.array([input_values])
1932
Fredrik Svedberg11563172022-07-06 14:54:12 +02001933         # Requantize int8 to int8 or int16 to int16
1934 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001935
1936 # scale needs to use double precision to match TFLite reference kernel
1937 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
1938 effective_multiplier, effective_shift = quantise_scale(effective_scale)
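            # Each constant value is then requantized as
            #   clamp(multiply_by_quantized_multiplier(val - zp_in, multiplier, shift) + zp_out)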
1939
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001940 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001941 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001942 input_val = val - ifm.quantization.zero_point
1943
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001944 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
1945 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001946
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001947 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
1948 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001949
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001950 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
1951 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001952
1953 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001954 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001955
1956 quantized_vals = []
1957 for val in input_values:
1958
1959 # Derive quantized value
1960 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001961 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
1962 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001963
1964 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001965 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
1966
1967 # Unsupported data type
1968 else:
1969 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001970
1971 # Make quantize op const and disconnect from parent node
1972
1973 # Remove reference of the current quant op from the parent tensor's consumer list
1974 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1975
1976 # Clear any references to parent node
1977 op.inputs = []
1978
1979 # Convert this quantize op to const
1980 op.type = Op.Const
1981
1982 return op
1983
1984
Ayaan Masood4965fae2022-06-29 11:30:57 +01001985def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
1986 """Static optimisation for SHAPE operator output value known at compile time"""
1987
1988 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
1989
1990 if op.type == Op.Shape and op.run_on_npu:
1991
1992 ifm, ofm = op.get_ifm_ofm()
1993
1994 if len(ifm.shape) != ofm.shape[0]:
1995 return op
1996
1997 # Remove reference of the current shape op from the parent tensor's consumer list
1998 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1999
2000 # Clear any references to parent node
2001 op.inputs = []
2002
2003 # Convert this SHAPE op to const
2004 op.type = Op.Const
2005
2006         # The SHAPE output is the (statically known) shape of the input tensor
2007 ofm.values = np.array(ifm.shape)
2008
2009 return op
2010
2011
Tim Hallea4ba662022-11-11 18:19:53 +00002012def fixup_dilation_gt2(op, arch, nng):
2013 assert op.run_on_npu
2014 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
2015 dilation_w, dilation_h = op.get_kernel_dilation()
2016
2017 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
2018 # kernel
2019 if dilation_w > 2 or dilation_h > 2:
2020 kernel_w, kernel_h = op.get_kernel_size()
2021 kernel_ic = op.weights.shape[-2]
2022 kernel_oc = op.weights.shape[-1]
2023
2024             # if the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
2025             # of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
2026 # odd = 1, even = 2
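            # Example: dilation 4 gives hw_dilation 2 and scale_dilation 2, so a 3x3 kernel becomes a
            # sparse 5x5 kernel ((3 - 1) * 2 + 1) that is then run with hardware dilation 2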
2027 hw_dilation_h = 1 if (dilation_h & 1) else 2
2028 hw_dilation_w = 1 if (dilation_w & 1) else 2
2029
2030 scale_dilation_h = dilation_h // hw_dilation_h
2031 scale_dilation_w = dilation_w // hw_dilation_w
2032
2033 # create new empty kernel (HWIO format)
2034 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
2035 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
2036
2037 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
2038 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
2039
2040 # copy the original kernel values into the new sparse kernel
2041 for h in range(0, kernel_h):
2042 for w in range(0, kernel_w):
2043 new_h = h * scale_dilation_h
2044 new_w = w * scale_dilation_w
2045 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
2046
2047 # update the weight tensor with the new dilated kernel
2048 op.weights.shape = new_kernel_shape
2049 op.weights.values = new_kernel_values
2050
2051 # enable(=2) / disable(=1) hardware dilation
2052 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
2053 op.attrs["dilation_h_factor"] = hw_dilation_h
2054 op.attrs["dilation_w_factor"] = hw_dilation_w
2055
2056 return op
2057
2058
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002059def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02002060 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002061 return op
2062
2063
wilisa0146c94772023-02-08 09:56:14 +00002064def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
Fredrik Svedberg11563172022-07-06 14:54:12 +02002065 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00002066 optimisation_list = [
2067 optimise_quantize,
2068 convert_shape_op_to_constant_tensor,
2069 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
2070 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002071
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002072 for idx, sg in enumerate(nng.subgraphs):
2073 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002074 nng,
2075 sg,
2076 arch,
2077 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01002078 optimisation_list,
2079 rewrite_unsupported=False,
2080 )
2081
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002082 # Pre-processing step
wilisa0146c94772023-02-08 09:56:14 +00002083 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002084
Ayaan Masood4965fae2022-06-29 11:30:57 +01002085 for idx, sg in enumerate(nng.subgraphs):
2086 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2087 nng,
2088 sg,
2089 arch,
2090 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002091 pre_process_list,
2092 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002093 )
2094
2095 # Handle Concat Ops
2096 for idx, sg in enumerate(nng.subgraphs):
2097 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
2098 sg.refresh_after_modification()
2099
2100 # Handle Split Ops
2101 for idx, sg in enumerate(nng.subgraphs):
2102 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2103 nng,
2104 sg,
2105 arch,
2106 [],
2107 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
2108 rewrite_unsupported=False,
2109 )
2110
2111 for idx, sg in enumerate(nng.subgraphs):
2112 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002113 nng,
2114 sg,
2115 arch,
2116 [rewrite_split_ops],
2117 [],
2118 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002119 )
2120
Johan Alfvena5e1b622023-02-02 14:59:03 +01002121 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002122 for idx, sg in enumerate(nng.subgraphs):
2123 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002124 nng,
2125 sg,
2126 arch,
2127 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01002128 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002129 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002130 )
2131
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002132 # Rewrite of operators
2133 op_rewrite_list = [
2134 set_tensor_equivalence,
2135 convert_mean_to_depthwise_conv_or_avgpool,
2136 convert_depthwise_to_conv,
2137 convert_conv_to_fc,
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02002138 convert_lstm,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002139 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02002140 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02002141 convert_mul_max_to_abs_or_lrelu,
2142 convert_lrelu,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002143 convert_hardswish_to_lut,
2144 rewrite_fully_connected_input,
2145 convert_batched_fc_shape,
2146 fixup_conv2d_backprop,
2147 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002148 reorder_depthwise_weights,
Rickard Bolin6986a072022-12-19 12:33:40 +00002149 convert_argmax_to_depthwise_conv_and_max_pool,
Tim Hall885033b2022-07-21 11:46:03 +01002150 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002151 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01002152 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002153 convert_tanh_sigmoid_to_lut,
2154 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00002155 fixup_dilation_gt2,
Raul Farkas72c6a242023-03-16 16:38:05 +00002156 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002157 ]
2158
2159 for idx, sg in enumerate(nng.subgraphs):
2160 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002161 nng,
2162 sg,
2163 arch,
2164 [],
2165 op_rewrite_list,
2166 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002167 )
2168
2169 for idx, sg in enumerate(nng.subgraphs):
2170 # remove passthrough tensors and attempt further optimizations
2171 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2172 nng,
2173 sg,
2174 arch,
2175 [remove_passthrough_tensor],
2176 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2177 )
2178
2179     # Removal of SplitSliceRead needs to be done after optimisation has been performed,
2180 # since ifm/ofm_shapes are of importance to this function
2181 for sg in nng.subgraphs:
2182 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2183 sg.refresh_after_modification()
2184
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002185 # Make sure that const optimisations on subgraph outputs are handled correctly
2186 for sg in nng.subgraphs:
2187 for ofm in sg.output_tensors:
2188 if ofm.is_const and ofm.ops[0].type_changed:
2189 # Subgraph output cannot be const - insert a memory copy
2190 op = ofm.ops[0]
2191 ofm_clone = ofm.clone()
2192 ofm_clone.values = ofm.values
2193 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002194 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002195 memcpy = create_add_nop(f"{ofm.name}_copy")
2196 memcpy.add_input_tensor(ofm_clone)
2197 memcpy.add_input_tensor(zero)
2198 memcpy.set_output_tensor(ofm)
2199 memcpy.set_ifm_ofm_shapes()
2200 op.set_output_tensor(ofm_clone)
2201 DebugDatabase.add_optimised(op, memcpy)
2202
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002203 return nng