blob: 99d4bf053e1760faefe95fdd478b60d4a34e414e [file] [log] [blame]
Louis Verhaardebf4af62021-01-27 15:57:57 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Early optimisation of the network graph, using the rewrite_graph module to do the traversal of the graph. These are
18# split into two parts optimise_graph_a and optimise_graph_b.
Tim Hall79d07d22020-04-27 18:20:16 +010019import math
Diqing Zhong016b8272020-12-16 16:46:06 +010020import uuid
Louis Verhaardebf4af62021-01-27 15:57:57 +010021from typing import Tuple
Diego Russoea6111a2020-04-14 18:41:58 +010022
23import numpy as np
24
Louis Verhaardd7911c42020-08-25 13:36:41 +020025from . import fp_math
Louis Verhaardb9fc33c2020-08-13 11:47:36 +020026from . import lut
Diego Russoea6111a2020-04-14 18:41:58 +010027from . import rewrite_graph
Louis Verhaardd7911c42020-08-25 13:36:41 +020028from . import scaling
Louis Verhaard1a92f782021-02-09 16:08:26 +010029from .api import NpuRoundingMode
Diego Russoea6111a2020-04-14 18:41:58 +010030from .data_type import DataType
Tim Halle6ccd872020-11-09 16:46:37 +000031from .debug_database import DebugDatabase
Louis Verhaard7db78962020-05-25 15:05:26 +020032from .errors import UnsupportedFeatureError
Patrik Gustavsson3a269202021-01-21 08:28:55 +010033from .errors import VelaError
Dwight Lidman42fed942020-05-29 09:37:03 +020034from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Louis Verhaard8912c532020-09-30 12:11:49 +020035from .numeric_util import clamp_sigmoid
Louis Verhaarde0ef2732020-06-03 08:56:44 +020036from .numeric_util import full_shape
Louis Verhaardf03bad32020-09-25 08:30:44 +020037from .numeric_util import round_away_zero
Louis Verhaarde8a5a782020-11-02 18:04:27 +010038from .operation import create_activation_function
Diego Russoe8a10452020-04-21 17:39:10 +010039from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020040from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010041from .operation import Operation
Michael McGeagh16895482020-12-14 15:51:20 +000042from .operation import Padding
Fredrik Svedbergd9c2c422020-12-01 16:33:45 +010043from .operation_util import create_avgpool_nop
Louis Verhaardc822d622021-03-11 14:59:06 +010044from .operation_util import get_pad_values_from_input
patrik.gustavssoneeb85152020-12-21 17:10:40 +000045from .shape4d import Shape4D
Fredrik Svedberga0c36242020-06-03 15:43:31 +020046from .softmax import SoftMax
Tim Hall93582962020-09-09 21:58:15 +010047from .tensor import check_quantized_tens_scaling_equal
Michael McGeaghc5b549b2020-08-07 11:54:28 +010048from .tensor import create_const_tensor
Louis Verhaardc822d622021-03-11 14:59:06 +010049from .tensor import create_equivalence_id
Charles Xu9a03fdf2020-07-02 15:12:40 +020050from .tensor import QuantizationParameters
Diego Russoe8a10452020-04-21 17:39:10 +010051from .tensor import Tensor
Louis Verhaard1a92f782021-02-09 16:08:26 +010052from .tensor import TensorPurpose
Michael McGeagh7a6f8432020-12-02 15:29:22 +000053from .tflite_mapping import optype_to_builtintype
Tim Hall79d07d22020-04-27 18:20:16 +010054
Michael McGeaghf3e3ad72020-12-02 12:39:03 +000055passthrough_nodes = (Op.Identity,)
Tim Hall79d07d22020-04-27 18:20:16 +010056
Michael McGeaghf3e3ad72020-12-02 12:39:03 +000057memory_only_ops = (Op.Reshape,)
Michael McGeagh11b0bdb2020-09-08 11:07:35 +010058
Tim Hall79d07d22020-04-27 18:20:16 +010059
def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Build a NOP average pool that writes one concat input into the concat OFM.

    The pool reads `ifm` and writes `ifm_shape` worth of data into the shared
    output feature map at `write_offset` (ConcatSliceWrite memory function).
    Returns the newly created avgpool op.
    """
    ofm = concat_op.ofm
    pool_op = create_avgpool_nop(name)
    pool_op.inputs = [ifm]
    pool_op.outputs = [ofm]
    pool_op.write_offset = write_offset
    pool_op.write_shape = ifm_shape
    pool_op.memory_function = Op.ConcatSliceWrite
    pool_op.ifm_shapes.append(ifm_shape)
    pool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
    # Register the pool as a producer of the concat output tensor
    ofm.ops.append(pool_op)
    DebugDatabase.add_optimised(concat_op, pool_op)
    return pool_op
75
76
def remove_passthrough_tensor(tens, arch, nng):
    """Bypass a tensor whose sole producer is a passthrough op (e.g. Identity).

    Returns the passthrough op's input tensor instead, otherwise the tensor
    itself unchanged.
    """
    if len(tens.ops) != 1:
        return tens
    producer = tens.ops[0]
    if producer.type not in passthrough_nodes:
        return tens
    assert len(producer.inputs) == 1
    return producer.inputs[0]
82
83
def rewrite_concat_ops(op, arch):
    """Rewrite a concat op into one ConcatSliceWrite average pool per input.

    Each input feature map gets a NOP avgpool (created by
    create_avg_pool_for_concat) that writes it into the shared OFM at its
    running offset along the concat axis. Returns the op for concat types,
    None otherwise.
    """
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    # The per-input avgpool writers below become the OFM's producers
    ofm.ops = []
    offset = 0

    # A fused activation cannot be applied per input slice; split it off first
    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        # Pack inserts a new size-1 dimension at `axis`; reshape the inputs so
        # the op can then be handled as a plain concat.
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]

        # Translate axis into left-padded 4D coordinates
        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                # Translate axis into left-padded 4D coordinates
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        # Each input is written at the running offset along the concat axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    # The inputs must exactly fill the OFM along the concat axis
    assert ofm.shape[axis] == offset

    return op
Tim Hall79d07d22020-04-27 18:20:16 +0100127
128
def rewrite_split_ops(tens, arch, nng):
    """Replace the producer of a split/slice output tensor with a SplitSliceRead.

    The new op reads the whole split input at the offset of this particular
    output along the split axis. Unpack and CPU-bound split ops are left
    untouched. Returns the tensor (possibly with its producer rewritten).
    """

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        # Translate axis into left-padded 4D coordinates
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    break

                # Accumulate the extents of the outputs preceding this tensor
                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens
175
176
def remove_SplitSliceRead(op, arch):
    """Eliminate a SplitSliceRead op by delegating its offset read.

    If the OFM has exactly one NPU, non-Reshape consumer and the shapes match,
    the read offset is moved onto that consumer and the SplitSliceRead is
    removed from the graph. Otherwise an avgpool NOP is inserted to perform
    the offset read in its place.
    """

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool need to be inserted
        if (
            len(op.ofm.consumer_list) == 1
            and op.ofm.consumer_list[0] is not None
            and op.ofm.consumer_list[0].run_on_npu
            and op.ofm.consumer_list[0].type != Op.Reshape
            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
        ):
            # SplitSliceRead can be performed by tensor consumer
            cons_op = op.ofm.consumer_list[0]
            if cons_op.ifm == op.ofm:
                cons_op.read_offsets[0] = op.read_offsets[0]
                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[0])
                cons_op.ifm_shapes[0] = op.ifm_shapes[0]
            elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
                # The tensor is the second input of a binary elementwise op
                cons_op.read_offsets[1] = op.read_offsets[0]
                cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[1])
                cons_op.ifm_shapes[1] = op.ifm_shapes[0]

            if "skirt" in cons_op.attrs:
                # The consumer may no longer rely on skirt-derived padding once
                # it reads at an offset; force explicit padding instead.
                assert cons_op.attrs["explicit_padding"] == cons_op.attrs["skirt"]
                cons_op.attrs["skirt"] = None
                cons_op.attrs["force_padding"] = True
            # Unlink the SplitSliceRead from the graph
            op.ofm.consumer_list.remove(cons_op)
            op.ofm.ops = []
            op.ifm.consumer_list.remove(op)
        else:
            # Fall back to an avgpool NOP that performs the offset read
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)
218
219
def avoid_nhcwb16_for_concat(tens):
    """Return True if any concat writer's depth offset prevents NHCWB16.

    When the concat axis maps to the C-dimension, NHCWB16 is only usable if
    every producer's write offset in C is a multiple of 16, since only then
    every op's OFM address stays 16-byte aligned. Offsets along other axes
    always keep c == 0 and are therefore aligned by construction.
    """
    for writer in tens.ops:
        if writer.write_offset is not None and writer.write_offset.depth % 16 != 0:
            return True
    return False
226
227
def avoid_nhcwb16_for_split(tens):
    """Return True if any consumer reads this tensor at a C-offset not divisible by 16.

    A read offset that is not a multiple of 16 in the C-dimension rules out
    NHCWB16 for the input tensor.
    """
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            offset = cons_op.read_offsets[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            offset = cons_op.read_offsets[1]
        else:
            assert False
        if offset is None:
            continue
        if offset[-1] % 16 != 0:
            return True
    return False
240
241
def avoid_nhcwb16_for_shapes(tens):
    """Return True if any producer/consumer op shape disagrees with tens.shape,
    which prevents the tensor from using NHCWB16 format."""
    tens_shape = Shape4D(tens.shape)
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            if tens_shape != cons_op.ifm_shapes[0]:
                return True
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            if tens_shape != cons_op.ifm_shapes[1]:
                return True
        else:
            assert False
    # Producers must also agree on the full 4D shape
    return any(tens_shape != prod_op.ofm_shapes[0] for prod_op in tens.ops)
259
260
# Check if non linear format can be used
def check_format_restrictions(tens, arch):
    """Clear tens.needs_linear_format when NHCWB16 is safe for this tensor.

    Returns early (keeping linear format) when the tensor touches CPU ops or
    subgraph boundaries, when concat writes / split reads are not 16-aligned
    in C, when producer/consumer shapes disagree, for int32 ReduceSum
    consumers, and for reshape chains that are not no-ops.
    """
    if len(tens.ops) < 1:
        return
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const) or any(
        cons is None for cons in tens.consumer_list
    ):
        return

    # Check if any of the producers/consumers is run on CPU
    if not all(cons.run_on_npu for cons in tens.consumer_list):
        return
    if not all(prod.run_on_npu for prod in tens.ops):
        return

    # "Concat" ofm exception:
    if avoid_nhcwb16_for_concat(tens):
        return

    # "Split" ifm exception:
    if avoid_nhcwb16_for_split(tens):
        return

    # Shapes checking: check all producers/consumers are NHCWB16 compatible with tens.shape
    if avoid_nhcwb16_for_shapes(tens):
        return

    for op in tens.consumer_list:
        if op.type == Op.ReduceSum and tens.dtype == DataType.int32:
            return
        if op.type == Op.Reshape:
            # Using NHCWB16 format for a no-op reshape is only an option if subsequent
            # consumers do not also need to perform a reshape or if the OFM is going to
            # be processed by CPU operations. No-op reshape consumers with empty lists
            # (those that have no consumers, or null-consumers used as list terminators)
            # must use normal NHWC output.

            def incompatible_consumers(oper):
                # Recursively yields True for any CPU op (or missing op) reachable
                # through a chain of Reshapes
                if oper and oper.type == Op.Reshape:
                    for consumer in oper.outputs[0].consumer_list:
                        yield from incompatible_consumers(consumer)
                yield not oper or not oper.run_on_npu

            if not any(incompatible_consumers(op)):

                def get_rewrites(oper):
                    # Yields every Reshape op in the chain rooted at oper
                    if oper and oper.type == Op.Reshape:
                        for consumer in oper.outputs[0].consumer_list:
                            yield from get_rewrites(consumer)
                        yield oper

                # Detect no-op reshapes by comparing their full input and output tensor shapes.
                inshape = op.ifm_shapes[0]
                compatible_shape = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(op)]
                if not (compatible_shape and all(compatible_shape)):
                    return
            else:
                return

    tens.needs_linear_format = False
321
322
def insert_copy_op_after_tens(tens):
    """Insert an avgpool-NOP copy of tens and point its consumers at the copy.

    After this, tens feeds only the copy op, and all previous consumers read
    the cloned tensor instead.
    """
    old_consumers = tens.consumer_list.copy()

    # Create a avg_pool nop op with ifm as input
    copy_tens = tens.clone()
    copy_op = create_avgpool_nop(tens.name + "_avgpool")
    copy_op.add_input_tensor(tens)
    copy_op.set_output_tensor(copy_tens)
    copy_op.set_ifm_ofm_shapes()
    copy_op.run_on_npu = True

    # Redirect the former consumers to read the copy instead
    for consumer in old_consumers:
        if consumer is None:
            continue
        for input_idx, consumer_input in enumerate(consumer.inputs):
            if consumer_input == tens:
                consumer.set_input_tensor(copy_tens, input_idx)

    DebugDatabase.add_optimised(tens.ops[0], copy_op)
342
343
def fix_sg_input_output(op, arch, nng):
    """Keep subgraph/CPU-visible tensors alive before a Reshape is removed.

    Reshape removal elsewhere eliminates one of the Reshape's tensors. If both
    the ifm and ofm must persist (they are subgraph inputs/outputs or are
    produced/consumed by CPU ops), an avgpool-NOP copy is inserted after the
    ifm so both original tensors survive the removal.
    """
    if not op.run_on_npu or op.type != Op.Reshape:
        return op

    # For the Reshape operators we want to remove, tensors are removed.
    # But in order to do this, they cannot be outputs of the subgraph;
    # this needs to be fixed prior to the removal.
    # Solution is to add an avgpool NOP, to maintain the original tensor.
    # This is also valid when the reshape ifm/ofm is produced respectively
    # consumed by CPU.

    # Check if operator ifm/ofm are sg ifm/ofm
    ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)
    ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)
    # Check if ifm/ofm is produced respectively consumed by CPU
    ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
    ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

    if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):
        # Both ifm and ofm need to persist, but only the ifm needs a copy, in order to remove the Reshape
        insert_copy_op_after_tens(op.ifm)

    return op
368
369
def needed_total_padding(input_size, stride, filter_size):
    """Return the total (both-sides) padding needed so that a strided filter
    covers the whole input (SAME-style output sizing)."""
    output_size = -(-input_size // stride)  # ceiling division
    covered_input = (output_size - 1) * stride + filter_size
    return max(0, covered_input - input_size)
375
376
def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.
    """
    total_padding = needed_total_padding(input_size, stride, filter_size)
    # The top/left padding is taken over unchanged; only the trailing edge may
    # need to shrink so the padding stays consistent with the stride.
    before = pad_before
    after = pad_after
    target_remainder = (total_padding - pad_before) % stride
    while after > 0 and after % stride != target_remainder:
        after -= 1
    return before, after
390
391
def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    """Calculate hardware padding and skirt for a kernel over input_shape.

    Returns (padding, skirt), both as (top, left, bottom, right) tuples.
    Raises UnsupportedFeatureError for unknown padding types.
    """
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        # Split total padding, putting the extra element (if odd) after
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    else:
        # Fix: original raised f"Unknown padding" — an f-string with no
        # placeholder; include the offending padding type for diagnostics.
        raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt
417
Tim Hallc30f4952020-06-15 20:47:35 +0100418
def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    """Calculate padding and skirt for an op whose IFM is upscaled (e.g. transpose conv).

    Returns (padding, skirt), both (top, left, bottom, right); for upscaled ops
    the skirt equals the padding. Raises UnsupportedFeatureError for unknown
    padding types.
    """
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        # Scale the trailing padding back down to pre-upscale coordinates
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        # Fix: original raised f"Unknown padding" — an f-string with no
        # placeholder; include the offending padding type for diagnostics.
        raise UnsupportedFeatureError(f"Unknown padding {padding_type}")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt
438
Tim Hall79d07d22020-04-27 18:20:16 +0100439
def fixup_conv2d_backprop(op, arch, nng):
    """Rewrite Conv2DBackpropInput into the NPU's switched-bias variant with
    unit strides and a TRANSPOSE-resampled IFM."""
    if op.type != Op.Conv2DBackpropInput:
        return op

    # Swap the output-shape input with the feature-map input
    op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
    op.type = Op.Conv2DBackpropInputSwitchedBias
    op.ifm.resampling_mode = resampling_mode.TRANSPOSE

    # Upscaling replaces striding: force strides to 1
    op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})

    return op
451
452
# Convert the op to an elementwise add
def convert_resizebilinear_1x1_to_add(op):
    """Convert a ResizeBilinear with 1x1 IFM into an elementwise Add with zeros.

    Upscaling a 1x1 feature map only broadcasts its single value, which adding
    an OFM-shaped zero tensor reproduces exactly.
    """
    op.type = Op.Add
    op.name = op.name + "_add"
    # NOTE(review): presumably flags the op for resizebilinear-specific scaling
    # later in the pipeline — confirm against downstream consumers.
    op.attrs["resizebilinear"] = True
    # Create an input tensor filled with zeros
    shape = op.ofm_shapes[0].as_list()
    tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
    tens.values = np.zeros(shape)
    tens.quant_values = np.zeros(shape, np.uint8)
    tens.quantization = QuantizationParameters(0.0, 255.0)
    tens.quantization.scale_f32 = 1.0
    tens.quantization.zero_point = 0
    tens.consumer_list = [op]
    tens_op = op.inputs[1].ops[0]
    tens_op.set_output_tensor(tens)
    # Set the add inputs
    op.inputs[1] = op.inputs[0]
    op.inputs[0] = tens
    op.set_ifm_ofm_shapes()

    return op
475
476
# Convert ResizeBilinear to a number of 2x2 pool ops
def convert_resizebilinear_to_2x2_pool(op):
    """Replace ResizeBilinear with a chain of 2x2 average pools, each step
    roughly doubling H/W (2x, or 2x-1 with align_corners).

    Intermediate tensors are int16; the final step writes the original
    outputs. A rescale factor compensates for 8<->16-bit steps.
    """
    count = 0
    pre_op = op
    outputs = op.outputs

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
    if op.attrs["align_corners"]:
        shape_modifier = 1
        op.attrs["padding"] = Padding.VALID
    else:
        shape_modifier = 0
        op.attrs["padding"] = Padding.SAME
    op.inputs[0].resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
    out_shape = np.array(op.ofm_shapes[0].get_hw_as_list())
    # Degenerate case: x == 2x - modifier only when x == modifier (i.e. 1x1
    # with align_corners); no pooling chain is needed then.
    if (upscaled_shape == upscaled_shape * 2 - shape_modifier).all():
        return op

    while (upscaled_shape < out_shape).all():
        if count == 0:
            # First step reuses the original op
            scaled_op = pre_op
        else:
            # Subsequent steps are clones fed by the previous step's output
            scaled_op = op.clone("_{}".format(count))
            scaled_op.inputs[0] = pre_op.outputs[0]

        upscaled_shape = upscaled_shape * 2 - shape_modifier

        if (upscaled_shape == out_shape).all():
            # Final step: write to the original output tensors
            scaled_op.outputs = outputs
            scaled_op.outputs[0].ops = [scaled_op]
        else:
            # Intermediate step: create an int16 tensor of the current size
            shape = op.ofm_shapes[0].as_list()
            shape[1:3] = upscaled_shape
            out_tens = Tensor(shape, DataType.int16, "{}_{}".format(op.outputs[0].name, count))
            out_tens.quantization = op.outputs[0].quantization.clone()
            out_tens.quantization.quant_min = np.iinfo(np.int16).min
            out_tens.quantization.quant_max = np.iinfo(np.int16).max
            scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op
        count += 1

    # Setup the scale value
    if scaled_op.inputs[0].dtype.bits == 8 and scaled_op.outputs[0].dtype.bits == 16:
        scaled_op.rescale = 128
    elif scaled_op.inputs[0].dtype.bits == 16 and scaled_op.outputs[0].dtype.bits == 8:
        scaled_op.rescale = 1 / 128
    else:
        scaled_op.rescale = None
    scaled_op.set_ifm_ofm_shapes()

    return op
530
531
def fixup_resizebilinear(op, arch, nng):
    """Dispatch ResizeBilinear rewrites: bypass no-op resizes, turn 1x1-IFM
    resizes into an elementwise add, and expand the rest into 2x2 pool chains."""
    if op.type != Op.ResizeBilinear or not op.run_on_npu:
        return op

    ifm_shape = op.ifm_shapes[0]
    if ifm_shape == op.ofm_shapes[0]:
        # Same-size resize is a no-op: keep only the data input and pass through
        op.inputs = op.inputs[:1]
        op.type = Op.Identity
    elif ifm_shape.height == 1 and ifm_shape.width == 1:
        convert_resizebilinear_1x1_to_add(op)
    else:
        convert_resizebilinear_to_2x2_pool(op)

    return op
544
545
def convert_nop_split_to_identity(op, arch, nng):
    """Turn a Split with num_splits == 1 into an Identity on its data input."""
    if op.type != Op.Split or op.attrs.get("num_splits") != 1:
        return op

    # Keep only the input matching the output shape; if this filters down to
    # anything other than one tensor, remove_passthrough_tensor will assert.
    out_shape = op.outputs[0].shape
    op.inputs = [inp for inp in op.inputs if inp.shape == out_shape]
    op.type = Op.Identity
    return op
553
554
def rewrite_fully_connected_input(op, arch, nng):
    """Flatten a FullyConnected IFM shape to (batch, 1, 1, in_elements),
    deriving the batch from the element count and the weight input depth."""
    if op.type != Op.FullyConnected:
        return op

    n_in_elems = op.weights.shape[-2]
    batch_size, leftover = divmod(op.ifm.elements(), n_in_elems)
    # The IFM must divide evenly into rows of the weight input depth
    assert leftover == 0

    op.ifm_shapes[0] = Shape4D([batch_size, 1, 1, n_in_elems])
    return op
564
565
def convert_batched_fc_shape(op, arch, nng):
    """Map a batched FullyConnected's batch dimension onto H/W.

    A batch of n becomes an (h, w) grid (4 -> 2x2, 8 -> 2x4, 16 -> 4x4,
    otherwise 1xn), and the 2D IO weights are expanded to 4D HWIO. Only the
    stored ifm/ofm shapes and the weight tensor are modified.
    """
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.quant_values = np.expand_dims(np.expand_dims(weight_tensor.quant_values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape))

            # Split the OFM batch the same way as the IFM batch
            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op
584
585
def unfuse_activation_function(op):
    """Split a fused activation off a ConcatTFLite into a standalone op.

    The concat is rewired to write a new intermediate tensor, which the new
    activation op consumes to produce the original output tensor.
    """
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        # Rewire: op -> intermediate_tens -> act_op -> out_tens
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
Fredrik Svedberg0f98b362020-09-29 10:00:39 +0200596
Louis Verhaard8912c532020-09-30 12:11:49 +0200597
def rewrite_stridedslice_output(op, arch, nng):
    """Resolve shrink_axis_mask/new_axis_mask on a StridedSlice's outputs.

    Adjusts each output's stored 4D shape to undo dimension removal/insertion,
    clears the masks, and records the per-output 4D axis in the
    "split_axis_4D" attribute for later split handling.
    """
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                # Clear the lowest set bit; its index is the shrunk axis
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                # Re-insert the dimension the mask removed
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                # Translate axis into left-padded 4D coordinates
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                # Clear the lowest set bit; its index is the inserted axis
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                # Drop the dimension the mask inserted
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                # Translate axis into left-padded 4D coordinates
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op
Diqing Zhongc7c0b1b2020-10-26 11:45:25 +0100651
652
def rewrite_unpack_output(op, arch, nng):
    """Rewrite an NPU Unpack (a.k.a. Unstack) as UnpackReshaped.

    Each output keeps the unpacked axis as a size-1 dim in its recorded 4D
    shape, and the 4D split axis is stored in op.attrs["split_axis_4D"].
    """
    if not (op.run_on_npu and op.type == Op.Unpack):
        return op
    first_out = op.outputs[0]
    split_axis = int(op.attrs["axis"])
    if split_axis < 0:
        # Normalise a negative axis to its positive equivalent
        split_axis += len(op.inputs[0].shape) + 1
    op.type = Op.UnpackReshaped
    # Re-insert the unpacked axis as a size-1 dim
    reshaped = first_out.shape[:split_axis] + [1] + first_out.shape[split_axis:]
    axis_4D = split_axis + (4 - len(reshaped))
    op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
    for idx in range(len(op.outputs)):
        op.ofm_shapes[idx] = Shape4D(reshaped)
    return op
Tim Hall79d07d22020-04-27 18:20:16 +0100669
670
def add_padding_fields(op, arch, nng):
    # Compute explicit padding and skirt for NPU ops that carry a "padding"
    # attribute, storing the results in op.attrs["explicit_padding"] and
    # op.attrs["skirt"].
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                # Convolutions: kernel H/W come from the weight tensor shape
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                # Pooling-style ops: kernel H/W come from the "ksize" attribute
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                # Transpose conv: padding must account for the output upscaling
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"], op.kernel, input_shape, op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op
697
698
def convert_depthwise_to_conv(op, arch, nng):
    # Depthwise is equivalent to a single conv2d if the ifm depth is 1 and
    # the ofm depth equals the depth multipler.
    # If those conditions are true, then we can perform a simple
    # switch of the operator type (and weight order)

    if op.type == Op.DepthwiseConv2DBias and (op.attrs["depth_multiplier"] != 1):
        ifm_shape = op.ifm_shapes[0]
        weight_tensor = op.inputs[1]
        ofm_shape = op.ofm_shapes[0]
        if (ifm_shape.depth == 1) and (ofm_shape.depth == op.attrs["depth_multiplier"]):
            # Change op type to Conv2d
            op.type = Op.Conv2DBias
            del op.attrs["channel_multiplier"]
            del op.attrs["depth_multiplier"]

            # Swap the last two axes of the weights to match Conv2D weight order
            weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.quant_values.shape))
        else:
            # Bug fix: the error text was previously passed as two separate
            # positional arguments (a tuple), producing a fragmented message;
            # implicit string concatenation keeps it as one message argument.
            raise UnsupportedFeatureError(
                f"Unsupported 'DEPTHWISE_CONV_2D' with depth_multiplier = {op.attrs['depth_multiplier']},"
                f" ifm channels = {ifm_shape.depth}, ofm channels = {ofm_shape.depth}"
            )
        DebugDatabase.add_optimised(op, op)
    return op
724
725
def reorder_depthwise_weights(op, arch, nng):
    """Swap the last two axes of a depthwise conv's weight tensor.

    The tensor is flagged with weight_transpose_depthwise so later passes
    know the reordering has already been applied.
    """
    if op.type.is_depthwise_conv2d_op():
        weights = op.inputs[1]
        weights.quant_values = np.transpose(weights.quant_values, (0, 1, 3, 2))
        weights.set_all_shapes(list(weights.quant_values.shape))
        weights.weight_transpose_depthwise = True

    return op
734
735
def optimise_strided_conv(op, arch, nng):
    # Optimise a Conv2D with horizontal stride 2 (op_index == 0, i.e.
    # presumably the first op of the network -- TODO confirm semantics of
    # op_index) by halving the IFM width while doubling its depth and
    # interleaving the weights to match, so the conv can run with stride 1 in x.
    stride_x, stride_y = op.get_kernel_stride()
    ifm_tensor, _, weight_tensor, _ = op.get_ifm_ifm2_weights_ofm()

    if (
        op.type == Op.Conv2DBias
        and op.op_index == 0
        and stride_x == 2
        and op.ifm_shapes[0].depth <= 4
        and op.ifm_shapes[0].width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    ):
        ifm_shape = op.ifm_shapes[0]
        # IFM: fold each pair of horizontally adjacent elements into the depth dim
        op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])

        # Weights
        weight_shape = weight_tensor.shape
        if weight_shape[1] % 2 != 0:
            # Pad kernel width to even with rows of zero-point values
            weight_shape[1] = weight_shape[1] + 1
            padded_array = np.zeros(weight_shape)
            for i in range(weight_shape[0]):
                padded_array[i] = np.vstack(
                    [
                        weight_tensor.quant_values[i],
                        np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
                    ]
                )
            weight_tensor.quant_values = padded_array
        # Reshape: two adjacent kernel-width positions map onto the doubled depth
        weight_shape[1] //= 2
        weight_shape[2] *= 2
        weight_tensor.quant_values = np.reshape(weight_tensor.quant_values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides: x stride becomes 1 after the width folding
        stride_x = 1
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

    return op
779
780
def convert_conv_to_fc(op, arch, nng):
    """Rewrite a 1x1 Conv2D applied to a 1x1 IFM as a FullyConnected op.

    Representing such convs as fully connected layers lets Vela better decide
    whether to use caching/double buffering for the weights (weights don't
    need to be reloaded when IFM H and W are 1).
    """
    if op.type != Op.Conv2DBias:
        return op
    ifm_height = op.ifm_shapes[0].height
    ifm_width = op.ifm_shapes[0].width
    kernel_h, kernel_w, _, _ = op.inputs[1].shape
    if ifm_height == 1 and ifm_width == 1 and kernel_h == 1 and kernel_w == 1:
        # Overwrite this op as a FullyConnected op
        op.name += "_fc"
        op.type = Op.FullyConnected
        op.attrs = {"weights_format": 0}
        # Drop the H and W axes of the weights: HWIO becomes IO (both are 1)
        weights = op.inputs[1]
        weights.quant_values = weights.quant_values.squeeze(axis=(0, 1))
        weights.set_all_shapes(list(weights.quant_values.shape))

        DebugDatabase.add_optimised(op, op)
    return op
804
805
def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    # Give a Relu whose IFM and OFM quantisation scaling differ its own
    # primary op: an average-pool no-op carrying the relu as fused activation.
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            # Return the replacement so the traversal rewires to it
            op = relu_fused_op
    return op
825
826
def fixup_elementwise_with_scalars(op, arch, nng):
    """Normalise the operand shapes of a binary elementwise op.

    When both operands have real shapes, the lower-rank one is padded with
    leading 1s to match the other. An operand marked as a scalar (shape [])
    that is produced by another operation gets a rank-matching all-ones shape.
    """
    if not op.type.is_binary_elementwise_op():
        return op
    ifm, ifm2, _, _ = op.get_ifm_ifm2_weights_ofm()
    if ifm2.shape != [] and ifm.shape != []:
        rank_diff = len(ifm.shape) - len(ifm2.shape)
        if rank_diff > 0:
            ifm2.shape = full_shape(len(ifm.shape), ifm2.shape, 1)
        elif rank_diff < 0:
            ifm.shape = full_shape(len(ifm2.shape), ifm.shape, 1)
    elif ifm.shape == [] and ifm.quant_values is None:
        # IFM is marked as a scalar but is the result of an operation;
        # give it an explicit shape of all 1s
        ifm.shape = [1] * len(ifm2.shape)
        ifm.storage_shape = ifm.shape
    elif ifm2.shape == [] and ifm2.quant_values is None:
        # Same for IFM2
        ifm2.shape = [1] * len(ifm.shape)
        ifm2.storage_shape = ifm2.shape
    return op
Tim Hall79d07d22020-04-27 18:20:16 +0100845
Louis Verhaarde0ef2732020-06-03 08:56:44 +0200846
def set_tensor_equivalence(op, arch, nng):
    """Propagate the output's equivalence id to all inputs of a memory-only op."""
    if op.type in memory_only_ops:
        out_eid = op.outputs[0].equivalence_id
        for tens in op.inputs:
            tens.equivalence_id = out_eid
    return op
854
855
def set_ifm_ofm_op_shapes(op, arch, nng):
    """Populate the op's ifm/ofm shape lists if not already set."""
    if op.run_on_npu and op.type.needs_shapes() and not (op.ifm_shapes or op.ofm_shapes):
        op.set_ifm_ofm_shapes()
    return op
863
864
def convert_softmax(op, arch, nng):
    """Replace an NPU Softmax op with its decomposed sub-graph."""
    if op.type != Op.Softmax or not op.run_on_npu:
        return op
    return SoftMax(op).get_graph()
870
871
def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X    For X = -1 or X > 0
    |   \   /     This subgraph can be replaced with either
    |    Mul      an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max
    """

    if op.type == Op.Maximum:
        # finds the Mul input(s) to the Max
        muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
        if len(muls) == 1:
            mul = muls[0].ops[0]
        elif len(muls) == 2:
            # In the case both inputs are Muls, find the one with the same input as the Max
            mul = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1][0].ops[0]
        else:
            # No Mul inputs
            return op

        # make sure the Mul doesn't have any other consumers
        mul_ofm = mul.outputs[0]
        if len(mul_ofm.consumers()) != 1:
            return op
        # make sure the Mul doesn't have a fused activation function
        if mul.activation:
            return op
        ifm, ofm = op.get_ifm_ofm()
        if ifm is None or ofm is None:
            return op

        # Only 8-bit types with matching IFM/OFM dtype are supported
        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
            return op
        if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
            return op

        # finds the branched input that goes to both the Max and the Mul
        shared = set(op.inputs) & set(mul.inputs)
        if len(shared) == 1:
            shared_in = shared.pop()
            # find the constant scalar input to the Mul
            const_tens = (set(mul.inputs) - {shared_in}).pop()
            # check that it is a scalar
            if const_tens.shape != []:
                return op
            const = const_tens.ops[0]
            # check that it is a constant
            if const.type != Op.Const:
                return op
            # Remove the Mul from the shared input's consumers
            shared_in.consumer_list.remove(mul)
        else:
            return op

        val = const.outputs[0].values
        if val >= 0:
            # Positive scalar: the pattern is LeakyRelu with alpha = val
            new_op = Op.LeakyRelu
            op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scale", to be used as input
            # to the LUT construction
            alpha_scalar = const_tens.quant_values - const_tens.quantization.zero_point
            mul_ifm_scale = np.double(ifm.quantization.scale_f32)
            mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
            mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
            alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
            op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
        elif val == -1:
            # max(x, -x) is Abs
            new_op = Op.Abs
        else:
            return op

        # Rewrite the Max in place as the detected op, keeping only the shared input
        op.type = new_op
        op.name = op.name.replace("Maximum", new_op.name)
        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
        op.inputs = [shared_in]
        op.set_ifm_ofm_shapes()

        # Record optimisation in debug database
        DebugDatabase.add_optimised(op, op)

    return op
957
958
def convert_hardswish_to_lut(op, arch, nng):
    # Convert an 8-bit HardSwish op into a LUT-based no-op Add; the LUT is
    # computed with 16-bit fixed-point arithmetic matching the TensorFlow Lite
    # Micro kernel, so results are bit exact with the reference.
    if op.type == Op.HardSwish:
        ifm, ofm = op.get_ifm_ofm()
        # Generate the LUT
        ifm_scale = np.double(ifm.quantization.scale_f32)
        ofm_scale = np.double(ofm.quantization.scale_f32)
        zp_in = ifm.quantization.zero_point
        zp_out = ofm.quantization.zero_point
        # High-resolution input scale: input values are pre-multiplied by 128 below
        ifm_scale_hires = (1 / 128) * ifm_scale
        relu_multiplier = np.double(3 / 32768)
        out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
        relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
        # Use 16bit scale
        out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
        relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)

        values = []
        ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
        quantized_min = min(ix)
        quantized_max = max(ix)
        for x in ix:
            input_value = x - zp_in
            input_value_hires = input_value * 128
            # Compute the input value on essentially the output scale, not shifted yet
            input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
            # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
            relu_value = np.int16(input_value_hires)
            if relu_shift < 31:
                relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)

            relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)

            if relu_shift < 31:
                relu_value = fp_math.shift_left16(relu_value, 1)

            if relu_shift > 31:
                relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)

            # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
            # Now convert that to a 16bit fixedpoint value in [0, 1]
            relu_value = (relu_value + (1 << 15)) >> 1
            lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
            shift = 31 - out_shift
            shift = -shift if shift < 0 else 0
            # Finally apply the output shift
            lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
            # Clamp to the quantised representable range
            lut_result = min(quantized_max, max(quantized_min, lut_result))
            values.append(lut_result)
        return convert_to_lut(op, values, "hardswish")
    return op
1009
1010
def convert_lrelu_to_mul_max(op, arch):
    # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
    # (the opposite of convert_mul_max_to_abs_or_lrelu)
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op

    # Add multiplication with alpha
    mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
    mul_alpha.add_input_tensor(ifm)
    # Create const tensor containing alpha as scalar
    alpha = op.attrs["alpha"]
    quantization = ifm.quantization.clone()
    quantization.min = 0
    quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
    quantization.zero_point = 0
    if np.isinf(1 / np.float32(alpha)):
        # Handling of alpha near zero
        quantization.scale_f32 = 1
        scalar = 0
    else:
        quantization.scale_f32 = alpha
        scalar = alpha
    alpha_tens = create_const_tensor(
        op.name + "_alpha_scalar", [], ifm.dtype, [scalar], np.float32, quantization=quantization
    )
    # The quantised value is 1; the scaling is carried entirely by scale_f32
    alpha_tens.quant_values = np.array([1])
    mul_alpha.add_input_tensor(alpha_tens)
    fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
    mul_alpha.set_output_tensor(fm_alpha)
    mul_alpha.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, mul_alpha)

    if check_quantized_tens_scaling_equal(ifm, ofm):
        # No identity multiplication is needed
        fm_id = ifm
    else:
        # Add multiplication with identity
        mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
        mul_identity.add_input_tensor(ifm)
        # Create const tensor containing identity as scalar
        quantization = ifm.quantization.clone()
        quantization.min = 0
        quantization.max = quantization.quant_max - quantization.quant_min
        quantization.scale_f32 = 1
        quantization.zero_point = 0
        identity_tens = create_const_tensor(
            op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
        )
        mul_identity.add_input_tensor(identity_tens)
        # Make sure that fm_id is allocated to a different address than fm_alpha
        fm_id = ofm.clone(op.name + "_id", set_unique=True)
        mul_identity.set_output_tensor(fm_id)
        mul_identity.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_identity)

    # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
    op.type = Op.Maximum
    op.name = op.name.replace("LeakyRelu", "Maximum")
    op.inputs = []
    ifm.consumer_list.remove(op)
    op.add_input_tensor(fm_alpha)
    op.add_input_tensor(fm_id)
    op.set_ifm_ofm_shapes()

    DebugDatabase.add_optimised(op, op)
    return op
1078
1079
def convert_to_lut(op, lut_values, lut_name):
    # Rewrite the operation by Add with scalar 0 + LUT activation.
    # Requires an 8-bit IFM; lut_values is the full 256-entry table.
    ifm = op.inputs[0]
    if ifm is None:
        return op
    assert ifm.dtype.size_in_bytes() == 1
    op.type = Op.Add
    op.name = op.name + "_lut_" + lut_name
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True
    # Create an input tensor containing scalar zero
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = ifm.quantization.scale_f32
    quantization.zero_point = 0
    tens = create_const_tensor(op.inputs[0].name + "_scalar0", [], ifm.dtype, [0], np.uint8, quantization=quantization)
    op.add_input_tensor(tens)
    op.ifm_shapes.append(Shape4D(tens.shape))

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as the IFM
    op.forced_output_quantization = ifm.quantization
    lut_tensor = lut.create_lut_tensor(op.name + "_values", lut_values, DataType.int8)
    op.set_activation_lut(lut_tensor)
    op.set_ifm_ofm_shapes()
    return op
1106
1107
def convert_to_lut8(op, fn, fn_name):
    """Convert op to a no-op plus an int8/uint8 LUT built from fn.

    fn maps a real input value to a real output value; each of the 256 LUT
    entries is fn applied to the dequantised input, requantised to the OFM.
    Only applies when IFM and OFM share an 8-bit dtype.
    """
    ifm, ofm = op.get_ifm_ofm()
    if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
        return op
    # Quantisation parameters for both ends of the table
    in_scale = np.double(ifm.quantization.scale_f32)
    out_scale = np.double(ofm.quantization.scale_f32)
    in_zp = ifm.quantization.zero_point
    out_zp = ofm.quantization.zero_point
    domain = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    lo = min(domain)
    hi = max(domain)
    table = []
    for q in domain:
        real_out = fn(in_scale * (q - in_zp))
        entry = round_away_zero(out_zp + real_out / out_scale)
        # Clamp to the quantised representable range
        table.append(min(hi, max(lo, entry)))
    return convert_to_lut(op, table, fn_name)
Louis Verhaardf03bad32020-09-25 08:30:44 +02001130
1131
def convert_lrelu_to_lut(op, arch):
    # Convert an 8-bit LeakyRelu to a LUT-based no-op Add: entries below the
    # input zero point are scaled by alpha, the rest by the identity scale.
    ifm, ofm = op.get_ifm_ofm()
    # Generate the LUT
    alpha = op.attrs["alpha"]
    ifm_scale = np.double(ifm.quantization.scale_f32)
    ofm_scale = np.double(ofm.quantization.scale_f32)
    zp_in = ifm.quantization.zero_point
    zp_out = ofm.quantization.zero_point
    identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
    alpha_scalar = 1
    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
    if "alpha_scaling" in op.attrs:
        # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
        alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
    values = []
    ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    quantized_min = min(ix)
    quantized_max = max(ix)
    for x in ix:
        if x < zp_in:
            # Negative input region: apply the alpha scaling
            lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
                alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
            )
        else:
            # Non-negative region: identity scaling (handles IFM/OFM scale diff)
            lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
        lut_result = min(quantized_max, max(quantized_min, lut_result))
        values.append(lut_result)
    return convert_to_lut(op, values, "lrelu")
Louis Verhaardb9fc33c2020-08-13 11:47:36 +02001160
1161
def convert_lrelu(op, arch, nng):
    """Lower a LeakyRelu: LUT for 8-bit, unchanged for matching-scale int16,
    otherwise a mul + max decomposition."""
    if op.type != Op.LeakyRelu:
        return op
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op
    if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
        # use LUT for int8/uint8
        return convert_lrelu_to_lut(op, arch)
    if ifm.dtype == ofm.dtype == DataType.int16 and check_quantized_tens_scaling_equal(ifm, ofm):
        # int16 with equal input/output scaling can stay as LeakyRelu
        return op
    return convert_lrelu_to_mul_max(op, arch)
1176
1177
def convert_tanh_sigmoid_to_lut(op, arch, nng):
    """Convert int8/uint8 Sigmoid and Tanh to a LUT based solution."""
    lut_fn_by_type = {
        Op.Sigmoid: (clamp_sigmoid, "sigmoid"),
        Op.Tanh: (math.tanh, "tanh"),
    }
    entry = lut_fn_by_type.get(op.type)
    if entry is None:
        return op
    fn, fn_name = entry
    return convert_to_lut8(op, fn, fn_name)
1185
1186
def remove_reshapes(op, arch):
    """Bypass an NPU Reshape op by rewiring its producers/consumers around it.

    Reshape is memory-only; when ifm and ofm quantization match, the op can be
    removed in one of two ways:
    - if the ofm must be preserved (it is a subgraph output, or consumed by the
      CPU), the ifm is replaced by the ofm everywhere;
    - otherwise the ofm is replaced by the ifm in all its consumers.
    """
    if op.run_on_npu and op.type == Op.Reshape:
        ofm = op.ofm
        ifm = op.ifm

        # Check if quantization is the same in the input and output for the reshape ops
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # TODO Both tensors are needed, since quantisation properties currently are linked to Tensors.
            # In order to remove this reshape either quantization properties need to be moved to Operator,
            # or the reshape need to be replace with a NOP.
            return

        # Check if Reshape ifm/ofm are network ifm/ofm
        # A None consumer/producer entry denotes the subgraph boundary
        ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
        ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in ifm.consumer_list)
        ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in ofm.consumer_list)
        # Check if ifm/ofm is produced respectively consumed by CPU
        ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)
        ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

        # This case should be handled prior to this function
        assert not ((ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed))

        if ofm_is_sg_ofm or ofm_is_cpu_consumed:
            # Bypassed by replacing ifm with ofm
            ofm.ops = []
            for prev_op in ifm.ops:
                prev_op.outputs = [ofm]
                ofm.ops.append(prev_op)

            # All ifm consumers need to use ofm as input
            for ifm_cons in ifm.consumer_list:
                for ifm_idx, cons_ifm in enumerate(ifm_cons.inputs):
                    if cons_ifm == ifm:
                        ifm_cons.set_input_tensor(ofm, ifm_idx)
        else:
            # Bypassed Reshape by replacing ofm with ifm
            for cons in ofm.consumer_list:
                for ifm_idx, cons_ifm in enumerate(cons.inputs):
                    if cons_ifm == ofm:
                        cons.set_input_tensor(ifm, ifm_idx)
Patrik Gustavsson3a269202021-01-21 08:28:55 +01001228
1229
def check_reshapes(op, arch):
    """Sanity check: raise if a removable NPU Reshape op is still in the graph."""
    if not (op.run_on_npu and op.type == Op.Reshape):
        return
    ofm = op.ofm
    if check_quantized_tens_scaling_equal(op.ifm, ofm):
        # Equal scaling means remove_reshapes should already have bypassed this op
        raise VelaError(f"Reshape op {op} expected to have been removed, still remains")
Patrik Gustavssonfa4cb292020-09-10 08:19:36 +02001237
1238
def fuse_activation_function_with_prev(op, arch, nng):
    """If op is a no-op with an activation, try to fuse the activation into the preceding op.

    Fusing requires that the producer runs on the NPU, is not a Default block type,
    is op's sole producer, has a single consumer, and has no activation of its own.
    On success the producer takes over op's activation (and LUT, if any) and writes
    directly to op's ofm, bypassing op.
    """
    # if op is a no-op: attempts to move the activation function to the preceding op
    if not op.attrs.get("is_nop", False) or op.activation is None:
        return op
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op
    # finds the input(s) to the operation
    prev_op = ifm.ops[0]
    # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
    fuse = (
        prev_op.run_on_npu
        and prev_op.type.npu_block_type != NpuBlockType.Default
        and len(ifm.ops) == 1
        and len(prev_op.outputs[0].consumers()) == 1
        and prev_op.activation is None
    )
    if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
        # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
        # LUT currently only works correctly for elementwise ops
        fuse = False
    if not fuse:
        return op
    # Move the fused activation function + corresponding info to prev_op
    prev_op.activation = op.activation
    prev_op.forced_output_quantization = op.forced_output_quantization
    if op.activation_lut is not None:
        prev_op.set_activation_lut(op.activation_lut)
    # Bypass op: prev_op now produces op's output tensor directly
    prev_op.set_output_tensor(ofm)
    DebugDatabase.add_optimised(op, prev_op)
    return op
1271
1272
Louis Verhaardc822d622021-03-11 14:59:06 +01001273def _leading_pad_ok(leading_pad, stride, kernel_size):
1274 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1275 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
1276 max_size = kernel_size // 2
1277 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1278
1279
def replace_pad_by_hw_pad(op: Operation, arch, nng):
    """
    Tries to completely remove a PAD operator by using hardware padding.
    E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
    is rewritten such that the PAD is removed, and the CONV uses SAME padding.
    Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
    if both operations can be run on the NPU.
    This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
    """
    if (
        (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
        and op.run_on_npu
        and op.attrs["padding"] == Padding.VALID
    ):
        pad_op = op.ifm.ops[0]
        if pad_op.type != Op.Pad or not pad_op.run_on_npu:
            return op
        # PAD must not change dtype or quantization, or hardware padding would change values
        if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
            return op
        top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
        k = op.kernel
        k_w, k_h = k.dilated_wh()

        # Check if the PAD operator can be replaced by hardware padding
        if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
            # Too much padding, it would require hardware padding to actually insert zeros
            return op
        if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
            return op

        if op.type.is_avgpool_op():
            # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
            for pad, k_size in (
                (left, k_w),
                (right, k_w),
                (top, k_h),
                (bottom, k_h),
            ):
                if pad not in (0, k_size // 2):
                    return op
            # Average pool is converted to depthwise, because NPU average pool + same padding
            # has a special implementation that is different from PAD followed by average pool with
            # valid padding.
            k_w, k_h = op.kernel.width, op.kernel.height
            ifm = op.ifm
            # Remember other inputs
            other_inputs = op.inputs[1:]
            # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
            quantization = QuantizationParameters(0.0, 255.0)
            quantization.scale_f32 = 1.0 / (k_w * k_h)
            quantization.zero_point = 0
            shape = [k_h, k_w, 1, op.ofm.shape[-1]]
            weights = np.full(shape, 1)

            weight_tens = create_const_tensor(
                op.name + "_weights",
                shape,
                op.ifm.dtype,
                weights,
                np.uint8,
                purpose=TensorPurpose.Weights,
                quantization=quantization,
            )
            weight_tens.quant_values = weights
            op.type = Op.DepthwiseConv2DBias
            op.inputs = []
            op.add_input_tensor(ifm)
            op.add_input_tensor(weight_tens)
            # Add bias tensor, all biases set to 0
            op.inputs.append(None)
            fixup_bias_tensors(op, arch, nng)
            # Add other inputs
            op.inputs.extend(other_inputs)
            # NATURAL rounding matches the reference average pool behavior
            op.rounding_mode = NpuRoundingMode.NATURAL

        # Bypass the PAD operator
        op.set_input_tensor(pad_op.ifm, 0)
        # Adjust the padding attributes of the convolution operator
        op.attrs["padding"] = Padding.EXPLICIT
        op.attrs["explicit_padding"] = (top, left, bottom, right)
        op.set_ifm_ofm_shapes()
    return op
1362
1363
def convert_pad(op: Operation, arch, nng):
    """
    Rewrites PAD operator to an average pool that copies the IFM to the OFM
    + up to 4 average pool operators that fill the OFM with zeros at the borders.
    This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad.

    Returns the average pool op that copies the IFM (the new producer of the OFM),
    or the unchanged op if it is not an NPU PAD.
    """
    if op.type != Op.Pad or not op.run_on_npu:
        return op
    top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)

    ifm = op.ifm
    assert ifm is not None
    ifm_shape = Shape4D(ifm.shape)
    ofm = op.ofm
    assert ofm is not None
    # The OFM's producers are rebuilt from the average pool ops created below
    ofm.ops = []
    ofm_shape = op.ofm_shapes[0]

    # Average pool op that copies IFM to the right place inside the OFM
    shp0 = Shape4D(0, 0, 0, 0)
    shp_top = shp0.with_height(top)
    avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
    avgpool_op.activation = op.activation
    quant = ofm.quantization
    # The zero point is the quantized representation of 0.0, i.e. the pad value
    pad_value = quant.zero_point
    # Add operations that fill the borders of the OFM
    if top > 0:
        shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
        )
        # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
    if bottom > 0:
        shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_bottom",
            shape.as_list(),
            ofm.dtype,
            shape.elements() * [pad_value],
            np.uint8,
            quantization=quant,
        )
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_avg_pool_for_concat(
            op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
        )
    if left > 0:
        # Left/right fillers only span the IFM height; corners are covered by top/bottom
        shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
        )
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
    if right > 0:
        shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
        zero_tens = create_const_tensor(
            op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
        )
        zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
        create_avg_pool_for_concat(
            op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
        )

    op.type = Op.ConcatTFLite
    return avgpool_op
1431
1432
def add_attrs_to_resizebilinear(op, arch, nng):
    """Set padding/stride/ksize attributes so ResizeBilinear can run as a 2x2 average pool.

    Only handles exact x2 upscaling: output (2H, 2W) without align_corners uses SAME
    padding, output (2H-1, 2W-1) with align_corners uses VALID padding. Other shapes
    are left untouched.
    """
    if op.type != Op.ResizeBilinear or not op.run_on_npu:
        return op
    in_tens = op.inputs[0]
    ifm_shape = op.ifm_shapes[0]
    ofm_shape = op.ofm_shapes[0]
    height_2x = ifm_shape.height * 2
    width_2x = ifm_shape.width * 2
    align_corners = op.attrs["align_corners"]
    if not align_corners and ofm_shape.height == height_2x and ofm_shape.width == width_2x:
        # Plain x2 upscale: SAME padding gives the correct output size
        op.attrs["padding"] = Padding.SAME
    elif align_corners and ofm_shape.height == (height_2x - 1) and ofm_shape.width == (width_2x - 1):
        # align_corners x2 upscale: run the avg pool without padding to get (2H-1, 2W-1)
        op.attrs["padding"] = Padding.VALID
    else:
        return op
    in_tens.resampling_mode = resampling_mode.NEAREST
    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
    return op
1457
1458
def fixup_bias_tensors(op, arch, nng):
    """Attach an all-zero bias tensor to ops that require a bias but have none."""
    if op.type.needs_bias() and op.bias is None:
        # One bias value per output channel (last dim of the weight tensor)
        n_biases = op.inputs[1].shape[-1]
        zero_biases = [0] * n_biases
        bias_tens = create_const_tensor(op.name + "_bias", [n_biases], DataType.int32, zero_biases)
        bias_tens.quant_values = bias_tens.values
        op.set_input_tensor(bias_tens, op.type.info.indices.biases[0])

    return op
1469
1470
def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
    """Rewrite an NPU Mean op as either a DepthwiseConv2DBias or an AvgPool.

    The mean over H/W axes is computed as a sum (depthwise with unit weights)
    scaled by 1/N, with the exact rounding/zero-point handling chosen to match
    the reference implementation for the given dtype and quantization.
    When input and output quantization are identical, a plain AvgPool with
    truncating rounding suffices.
    """
    if op.type == Op.Mean and op.run_on_npu:
        keep_dims = op.attrs.get("keep_dims", False)
        inp, axis = op.inputs
        shape = inp.shape
        dims = len(shape)

        # Height and width axes have different index depending on dimensions
        if axis.shape == [] or axis.shape[0] == 1:  # single axis
            axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
            if dims in (2, 3):
                if axis == 0:
                    h, w = shape[axis], 1
                else:
                    h, w = 1, shape[axis]
            else:
                if axis == 1:
                    h, w = shape[axis], 1
                else:
                    h, w = 1, shape[axis]
        else:  # multiple axes
            axis = sorted(axis.values)
            h, w = [shape[i] for i in axis]

        # Set necessary depthwise attributes
        op.attrs.update(
            {
                "padding": Padding.VALID,
                "stride_h": 1,
                "stride_w": 1,
                "strides": (1, 1, 1, 1),
                "depth_multiplier": 1,
                "channel_multiplier": 1,
                "dilation_h_factor": 1,
                "dilation_w_factor": 1,
                "dilation": (1, 1, 1, 1),
            }
        )
        # Change op type
        op.type = Op.DepthwiseConv2DBias
        # Set IFM/OFM shapes after changing op type
        op.set_ifm_ofm_shapes()

        weight_scale, bias = 1, None
        ofmq, ifmq = op.ofm.quantization, inp.quantization
        # Set rounding mode, scaling and zero point based on which reference implementation to match
        if len(shape) == 4 and axis == [1, 2] and keep_dims:
            if inp.dtype == DataType.uint8:
                # This attribute means a different scaling calculation is used in order to match reference
                op.low_precision_scaling = True
                weight_scale = h * w
                # Set zero points to 0 as they will be adjusted for with bias term
                foq = ofmq.clone()
                foq.zero_point = 0
                fiq = ifmq.clone()
                fiq.zero_point = 0
                op.forced_input_quantization = fiq
                bias_term = ofmq.zero_point - int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
                # If the bias term is outside uint8 range, we need an Add op to apply it.
                if bias_term < 0 or bias_term > 255:
                    intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
                    # Bias term has higher bitness (i32) than input/output (u8).
                    # 16 bits is enough since the bias is added/subtracted from a u8 value,
                    # the bias can only effectively assume values in the range [-255, 255].
                    intermediate.dtype = DataType.int16
                    intermediate.quantization.zero_point = 0
                    add_op = Operation(Op.Add, op.name + "_bias")
                    add_op.forced_output_quantization = foq
                    add_op.add_input_tensor(intermediate)
                    quant = QuantizationParameters()
                    quant.zero_point = 0
                    bias_term_tens = create_const_tensor(
                        op.name + "_bias",
                        [1, 1, 1, 1],
                        DataType.int16,
                        [bias_term],
                        np.int16,
                        quantization=quant,
                        quant_value_dtype=np.int16,
                    )
                    add_op.add_input_tensor(bias_term_tens)
                    add_op.set_output_tensor(op.ofm)
                    add_op.set_ifm_ofm_shapes()
                    # The activation moves to the new final op (the Add)
                    add_op.activation = op.activation
                    op.activation = None
                    op.set_output_tensor(intermediate)
                    op.set_ifm_ofm_shapes()
                # If not, we can just do it with the OFM zero point.
                else:
                    foq.zero_point = bias_term
                    op.forced_output_quantization = foq
            else:
                assert inp.dtype == DataType.int8
                # Use a depthwise to calculate the sum,
                # followed by a multiplication with 1/N to get the MEAN
                weight_scale = 1
                intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
                intermediate.dtype = DataType.int16
                mul_op = Operation(Op.Mul, op.name + "_mul")
                mul_op.add_input_tensor(intermediate)
                # Create scalar containing 1/N
                quant = QuantizationParameters()
                quant.zero_point = 0
                # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
                # while rounding mode NATURAL would round this to -1.
                # This can only occur if N is even, and can be emulated by
                # multiplying with a number that is slightly smaller than 1/N.
                # It must be so small that other roundings are not affected;
                # the calculated value is based on worst case,
                # which is sum 256 * N (the maximum sum that can occur with int8)
                n = int(h * w)
                eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
                quant.scale_f32 = 1 / (n - eps)
                scalar = create_const_tensor(
                    op.name + "_scalar", [1, 1, 1, 1], DataType.uint8, [1], np.uint8, quantization=quant
                )
                mul_op.add_input_tensor(scalar)
                mul_op.set_output_tensor(op.ofm)
                mul_op.set_ifm_ofm_shapes()
                mul_op.rounding_mode = NpuRoundingMode.NATURAL
                # The activation moves to the new final op (the Mul)
                mul_op.activation = op.activation
                op.activation = None
                op.set_output_tensor(intermediate)
                op.set_ifm_ofm_shapes()
        elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
            # Here we can just use a simple AvgPool with truncating rounding,
            # as we're emulating simple integer division.
            op.rounding_mode = NpuRoundingMode.TRUNCATE
            op.type = Op.AvgPool
            op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
        else:
            op.rounding_mode = NpuRoundingMode.NATURAL
            weight_scale = 1 / (h * w)
            # Input zero point is adjusted after mean calculation, so we emulate that with a bias
            bias = -ifmq.zero_point * h * w
            fiq = ifmq.clone()
            fiq.zero_point = 0
            op.forced_input_quantization = fiq

        # Change dimensions to 4
        if dims < 4:
            shape = [1] + shape
            if dims == 2:
                shape += [1]

        # If height is greater than max kernel height, reshape to from HxW to 1x(HxW)
        if h > 64:
            shape = [shape[0], 1, h * w, shape[3]]
            op.ifm_shapes[0] = Shape4D(shape)
            if h > 256 and op.type == Op.AvgPool:
                op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})

        # If the AvgPool version is used, we don't need to do anything else
        if op.type == Op.AvgPool:
            return op

        # Make unit weight tensor quantization
        weight_quant = ifmq.clone()
        weight_quant.min = 0
        weight_quant.max = 255
        weight_quant.scale_f32 = weight_scale
        weight_quant.zero_point = 0

        # Set weight shape to [H,W,C,B]
        weight_shape = shape[1:4] + [shape[0]]
        # Add unit weight tensor
        op.set_input_tensor(
            create_const_tensor(
                "weights",
                weight_shape,
                inp.dtype,
                np.ones(weight_shape),
                value_dtype=np.uint8,
                quantization=weight_quant,
            ),
            1,
        )
        op.weights.quant_values = np.reshape(op.inputs[1].quant_values, weight_shape)

        # Add None bias tensor
        op.inputs.append(None)
        # Add bias tensor
        if bias:
            bias_shape = [shape[-1]]
            op.set_input_tensor(
                create_const_tensor(
                    "bias",
                    bias_shape,
                    inp.dtype,
                    np.ones(bias_shape) * bias,
                    value_dtype=np.int32,
                    quant_value_dtype=np.int32,
                    quantization=None,
                ),
                2,
            )

    return op
1669
1670
def supported_operator_check(op, arch, nng):
    """Mark the op with whether the target architecture can run it on the NPU."""
    is_supported = arch.supported_operators.is_operator_supported(op)
    op.run_on_npu = is_supported
    return op
1674
1675
def _record_optimised(op, arch):
    """Record every non-constant op in the debug database as its own optimised form."""
    if op.type == Op.Const:
        return
    DebugDatabase.add_optimised(op, op)
1679
1680
def optimise_graph_a(nng, arch, verbose_graph=False):
    """Run the first stage of graph optimisation passes over all subgraphs.

    The pass order matters: supported-operator marking and shape setting first,
    then concat/split rewrites, subgraph input/output fixes, reshape removal,
    the main per-op rewrite list, and finally cleanup/verification passes.
    Returns the modified nng.
    """
    if verbose_graph:
        nng.print_graph()

    pre_process_list = [
        supported_operator_check,
        set_ifm_ofm_op_shapes,
        # TODO: memory-only Op removal
    ]

    for idx, sg in enumerate(nng.subgraphs):
        # rewrite graph pass
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], pre_process_list, rewrite_unsupported=False,
        )

    # Handle Concat Ops
    for idx, sg in enumerate(nng.subgraphs):
        # rewrite graph pass
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
        sg.refresh_after_modification()

    # Handle Split Ops
    for idx, sg in enumerate(nng.subgraphs):
        # rewrite graph pass
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [],
            [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
            rewrite_unsupported=False,
        )

    for idx, sg in enumerate(nng.subgraphs):
        # rewrite graph pass (rewrite_split_ops is a tensor rewrite, hence the first list)
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [rewrite_split_ops], [], rewrite_unsupported=False,
        )

    # Handle sg input output
    for idx, sg in enumerate(nng.subgraphs):
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], [fix_sg_input_output], rewrite_unsupported=False,
        )

    # Removal of reshapes
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_reshapes])
        sg.refresh_after_modification()

    op_rewrite_list = [
        set_tensor_equivalence,
        convert_mean_to_depthwise_conv_or_avgpool,
        convert_depthwise_to_conv,
        convert_conv_to_fc,
        convert_softmax,
        optimise_strided_conv,
        convert_hardswish_to_lut,
        rewrite_fully_connected_input,
        convert_batched_fc_shape,
        fixup_conv2d_backprop,
        fixup_relus_with_differing_ifm_ofm_scaling,
        fixup_elementwise_with_scalars,  # TODO Move to early stage?
        reorder_depthwise_weights,
        fixup_resizebilinear,
        fixup_bias_tensors,
        convert_mul_max_to_abs_or_lrelu,
        convert_lrelu,
        convert_tanh_sigmoid_to_lut,
        replace_pad_by_hw_pad,
    ]

    for idx, sg in enumerate(nng.subgraphs):
        # rewrite graph pass
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng, sg, arch, [], op_rewrite_list, rewrite_unsupported=False,
        )

    for idx, sg in enumerate(nng.subgraphs):
        # remove passthrough tensors and attempt further optimizations
        nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
            nng,
            sg,
            arch,
            [remove_passthrough_tensor],
            [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
        )

    # Removal of SplitSliceRead, need to be done after optimisation has been performed,
    # since ifm/ofm_shapes are of importance to this function
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
        sg.refresh_after_modification()

    # Check Tensor Format restrictions
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [check_format_restrictions], [])
        sg.refresh_after_modification()

    # Post-optimisation operator debug tracing, and checking that no undesired reshapes are left in the graph
    for sg in nng.subgraphs:
        rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [check_reshapes, _record_optimised])

    if verbose_graph:
        nng.print_graph()
    return nng