Blame - ethosu/vela/tflite_graph_optimiser.py - ml/ethos-u/ethos-u-vela

2021-06-28 07:41:58 +0200

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

16

# Description:

17

# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module

18

# to do the traversal of the graph.

19

import math

20

import uuid

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

import numpy as np

from . import fp_math

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

25

from . import rewrite_graph

26

from . import scaling

27

from .api import NpuRoundingMode

Fredrik Svedberg

2022-07-06 13:42:24 +0200

[diff] [blame]

28

from .data_type import BaseType

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

29

from .data_type import DataType

30

from .debug_database import DebugDatabase

31

from .errors import UnsupportedFeatureError

32

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

33

from .graph_optimiser_util import bypass_memory_only_ops

Patrik Gustavsson

c74682c

2021-08-17 14:26:38 +0200

[diff] [blame]

34

from .graph_optimiser_util import calc_explicit_padding

Patrik Gustavsson

df99510

2021-08-23 15:33:59 +0200

[diff] [blame]

35

from .graph_optimiser_util import convert_depthwise_to_conv

Patrik Gustavsson

f436ada

2021-09-14 14:56:48 +0200

[diff] [blame]

36

from .graph_optimiser_util import convert_to_lut

Patrik Gustavsson

df99510

2021-08-23 15:33:59 +0200

[diff] [blame]

37

from .graph_optimiser_util import fix_sg_input_output

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

38

from .graph_optimiser_util import memory_only_ops

Patrik Gustavsson

f1580f0

2021-09-01 12:43:02 +0200

[diff] [blame]

39

from .graph_optimiser_util import move_splitsliceread_to_consumer

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

40

from .graph_optimiser_util import needed_total_padding

41

from .graph_optimiser_util import set_ifm_ofm_op_shapes

42

from .graph_optimiser_util import set_tensor_equivalence

43

from .numeric_util import clamp_sigmoid

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

44

from .numeric_util import round_away_zero

Johan Alfvén

1700939

2022-08-30 09:14:56 +0200

[diff] [blame]

45

from .numeric_util import round_up_to_int

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

46

from .operation import create_activation_function

Fredrik Svedberg

1a7527c

2021-09-13 15:52:16 +0200

[diff] [blame]

47

from .operation import ExplicitScaling

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

48

from .operation import NpuBlockType

49

from .operation import Op

50

from .operation import Operation

51

from .operation import Padding

52

from .operation_util import create_avgpool_nop

53

from .operation_util import get_pad_values_from_input

Ayaan Masood

2022-06-29 18:16:04 +0100

[diff] [blame]

54

from .scaling import quantise_scale

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

55

from .shape4d import Shape4D

56

from .softmax import SoftMax

57

from .tensor import check_quantized_tens_scaling_equal

58

from .tensor import create_const_tensor

59

from .tensor import create_equivalence_id

60

from .tensor import QuantizationParameters

61

from .tensor import Tensor

62

from .tensor import TensorPurpose

63

from .tflite_mapping import optype_to_builtintype

64

65

# Operator types whose single producer is bypassed by remove_passthrough_tensor
passthrough_nodes = (Op.Identity,)

66

67

68

def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Build the no-op average pool that writes one concat input into the concat OFM.

    The new op reads `ifm`, writes into `concat_op.ofm` at `write_offset`, is
    registered as a producer of that OFM and is recorded in the debug database
    as an optimisation of `concat_op`.

    Returns the newly created average pool operation.
    """
    concat_ofm = concat_op.ofm
    nop = create_avgpool_nop(name)

    # Connect the op: one concat input in, the shared concat OFM out
    nop.inputs = [ifm]
    nop.outputs = [concat_ofm]

    # Only the slice [write_offset, write_offset + ifm_shape) of the OFM is written
    nop.write_offset = write_offset
    nop.write_shape = ifm_shape
    concat_ofm.ops.append(nop)
    DebugDatabase.add_optimised(concat_op, nop)

    nop.ifm_shapes.append(ifm_shape)
    nop.ofm_shapes.append(concat_op.ofm_shapes[0])
    nop.memory_function = Op.ConcatSliceWrite
    return nop

def remove_passthrough_tensor(tens, arch, nng):
    """Return the input of a pass-through producer instead of `tens`.

    When `tens` has exactly one producer and that producer's type is listed in
    `passthrough_nodes`, the producer's single input tensor is returned;
    otherwise `tens` is returned unchanged. `arch` and `nng` are unused.
    """
    producers = tens.ops
    if len(producers) != 1:
        return tens

    producer = producers[0]
    if producer.type not in passthrough_nodes:
        return tens

    assert len(producer.inputs) == 1
    return producer.inputs[0]

def rewrite_concat_ops(op, arch):
    """Replace an NPU concat op by one slice-writing average pool per input.

    The OFM's producer list is cleared and refilled with the average pools
    created by create_avg_pool_for_concat, each writing at an increasing offset
    along the concat axis. Pack ops are first given reshaped 4D input shapes
    and retyped to Op.PackReshaped.

    Returns None when the op is not an NPU concat op, otherwise returns `op`.
    """
    if not (op.run_on_npu and op.type.is_concat_op()):
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    running_offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        first_shape = op.inputs[0].shape
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(first_shape) + 1 + axis

        # Every input gets a unit dimension inserted at the pack axis
        desired_shape = first_shape[:axis] + [1] + first_shape[axis:]
        axis_4D = axis + (4 - len(desired_shape))

        for idx in range(len(op.inputs)):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            # Non-negative axes are shifted to their position in the 4D shape
            axis_4D = axis + (4 - len(inp.shape)) if axis >= 0 else axis

        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = running_offset
        slice_end = running_offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        running_offset = slice_end

    # The slices must exactly fill the OFM along the concat axis
    assert ofm.shape[axis] == running_offset

    return op

def rewrite_split_ops(tens, arch, nng):
    """Replace the split-type producer of `tens` by a SplitSliceRead operation.

    Applies only when `tens` has a single producer that is a split op (other
    than Op.Unpack) running on the NPU. The new op reads the split input at the
    offset belonging to `tens` and becomes the sole producer of `tens`.

    Always returns `tens`. `arch` and `nng` are unused.
    """
    producers = tens.ops
    if not (len(producers) == 1 and producers[0].type.is_split_op() and producers[0].type != Op.Unpack):
        return tens

    split_op = producers[0]

    # Not supported so leave it and run on CPU
    if not split_op.run_on_npu:
        return tens

    inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

    tens.ops = []
    new_op = Operation(Op.SplitSliceRead, split_op.name)
    new_op.inputs = [inp]
    ofm_shape_idx = 0

    read_shape = None
    if None not in (offset_end, offset_start):
        # the read shape is relative to each start offset
        read_shape = [end - start for end, start in zip(offset_end, offset_start)]

    # For Split the offset cannot be extracted from the tensor so it has to
    # be calculated from the index of the output tensor
    if axis is not None:
        # Accumulate the sizes of all outputs that precede `tens`
        offset_start = [0] * 4
        # Present for UnpackReshaped and some StridedSlice
        axis_4D_list = split_op.attrs.get("split_axis_4D", None)
        for idx, out in enumerate(outputs):
            if axis_4D_list is not None:
                axis_4D = axis_4D_list[idx]
            else:
                split_op.ofm_shapes[idx] = Shape4D(out.shape)
                axis_4D = axis + (4 - len(out.shape)) if axis >= 0 else axis

            if out == tens:
                ofm_shape_idx = idx
                read_shape = split_op.ofm_shapes[idx]
                break

            offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

    new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
    new_op.read_shapes[0] = read_shape
    new_op.run_on_npu = True
    new_op.set_output_tensor(tens)
    new_op.ifm_shapes.append(Shape4D(inp.shape))
    new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
    DebugDatabase.add_optimised(split_op, new_op)

    return tens

def remove_SplitSliceRead(op, arch):
    """Eliminate a SplitSliceRead op by folding it into its consumer or an average pool.

    If the OFM has exactly one consumer that runs on the NPU, is not a
    memory-only op, and the op's OFM shape equals the 4D form of the tensor
    shape, the read is moved onto that consumer. Otherwise a no-op average pool
    carrying the same read offset/shape replaces the op as producer of the OFM.
    Ops of any other type are left untouched. Returns None.
    """
    if op.type != Op.SplitSliceRead:
        return

    # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool need to be inserted
    consumers = op.ofm.consumer_list
    sole_consumer = consumers[0] if len(consumers) == 1 else None
    can_move_to_consumer = (
        sole_consumer is not None
        and sole_consumer.run_on_npu
        and sole_consumer.type not in memory_only_ops
        and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
    )

    if can_move_to_consumer:
        # SplitSliceRead can be performed by tensor consumer
        move_splitsliceread_to_consumer(op, sole_consumer)
        return

    nop = create_avgpool_nop(op.name + "_avgpool")
    nop.add_input_tensor(op.ifm)
    nop.outputs = [op.ofm]
    # Swap the producer of the OFM from the SplitSliceRead to the average pool
    op.ofm.ops.remove(op)
    op.ofm.ops.append(nop)
    nop.ifm_shapes.append(op.ifm_shapes[0])
    nop.ofm_shapes.append(op.ofm_shapes[0])
    nop.read_offsets[0] = op.read_offsets[0]
    nop.read_shapes[0] = op.read_shapes[0]

    op.ifm.consumer_list.remove(op)
    DebugDatabase.add_optimised(op, nop)

219

220

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

221

def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    """Compute the (top, left, bottom, right) padding and skirt for an operation.

    `kernel` supplies the dilated width/height and the stride; `input_shape`
    supplies `height` and `width`. `explicit_padding` is only read for
    Padding.EXPLICIT and Padding.TILE.

    Returns a `(padding, skirt)` pair of 4-tuples.
    Raises UnsupportedFeatureError for any other padding type.
    """
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    in_h = int(input_shape.height)
    in_w = int(input_shape.width)
    ypad = needed_total_padding(in_h, int(s_y), int(k_h))
    xpad = needed_total_padding(in_w, int(s_x), int(k_w))

    if padding_type == Padding.SAME:
        # Split the total padding; any odd remainder goes to the bottom/right
        top_pad, bottom_pad = (ypad + 0) // 2, (ypad + 1) // 2
        left_pad, right_pad = (xpad + 0) // 2, (xpad + 1) // 2
    elif padding_type == Padding.VALID:
        top_pad = bottom_pad = 0
        left_pad = right_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(in_h, int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(in_w, int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    # The bottom/right skirt is whatever remains of the needed total padding
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt

249

250

251

def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    """Compute padding and skirt for an operation whose IFM is upscaled.

    `kernel_size` is indexed as (height, width); `stride[1]` and `stride[2]`
    are used as the vertical and horizontal strides.

    Returns a `(padding, skirt)` pair; both are the same
    (top, left, bottom, right) tuple.
    Raises UnsupportedFeatureError unless padding is SAME or VALID.
    """
    kernel_height = kernel_size[0]
    kernel_width = kernel_size[1]

    if padding_type == Padding.SAME:
        # Total padding is evaluated on the upscaled input dimensions
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    return padding, padding

270

271

272

def fixup_conv2d_backprop(op, arch, nng):
    """Rewrite a Conv2DBackpropInput op into its switched-bias form.

    Inputs 0 and 2 are exchanged, the type becomes
    Op.Conv2DBackpropInputSwitchedBias, the IFM resampling mode is set to
    TRANSPOSE and all stride attributes are reset to 1. Other ops pass through
    untouched. Returns `op`; `arch` and `nng` are unused.
    """
    if op.type != Op.Conv2DBackpropInput:
        return op

    # flip the inputs
    first, third = op.inputs[0], op.inputs[2]
    op.inputs[0] = third
    op.inputs[2] = first

    op.type = Op.Conv2DBackpropInputSwitchedBias
    op.ifm_resampling_mode = resampling_mode.TRANSPOSE

    # Update strides
    op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})

    return op

# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    """Turn a resize op into an Add of its IFM with an all-zero tensor.

    A zero-valued tensor with the op's OFM shape becomes input 0, the original
    IFM becomes input 1, and the producer of the original second input is
    re-pointed at the new zero tensor. Returns `op`.
    """
    # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.type = Op.Add
    op.name = op.name + "_add"

    # Create an input tensor filled with zeros
    ofm_shape = op.ofm_shapes[0].as_list()
    zero_tens = Tensor(ofm_shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
    zero_tens.values = np.zeros(ofm_shape, zero_tens.dtype.as_numpy_type())

    quant = QuantizationParameters(0.0, 255.0)
    zero_tens.quantization = quant
    quant.scale_f32 = 1.0
    quant.zero_point = 0

    zero_tens.consumer_list = [op]
    # The op that produced the old second input now produces the zero tensor
    producer = op.inputs[1].ops[0]
    producer.set_output_tensor(zero_tens)

    # Set the add inputs
    op.inputs[1] = op.inputs[0]
    op.inputs[0] = zero_tens
    op.set_ifm_ofm_shapes()

    return op

Tim Hall

2022-07-21 11:46:03 +0100

[diff] [blame]

307

# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
# to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    """Rewrite `op` in place as a DepthwiseConv2DBias with a one-hot kernel.

    The op keeps its IFM as input 0, receives a constant weight tensor as
    input 1 and a bias tensor (created by fixup_bias_tensors) as input 2.
    The OFM data type must be uint8, int8 or int16. Returns `op`.
    """
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]

    # change resizebilinear to depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(
        {
            "padding": Padding.VALID,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
    )
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0

    ofm_dtype = ofm.dtype
    if ofm_dtype == DataType.uint8:
        weight_value_dtype = np.uint8
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        if ofm_dtype == DataType.int8:
            weight_value_dtype = np.int8
        else:
            assert ofm_dtype == DataType.int16
            weight_value_dtype = np.int16

        # Signed types use the full two's complement range
        half_range = 1 << (ofm_dtype.bits - 1)
        weight_quant.quant_min = -half_range
        weight_quant.quant_max = half_range - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    half = upscale_factor // 2
    weight_values = [0] * (upscale_factor * upscale_factor)
    weight_values[half * upscale_factor + half] = 1

    # NOTE(review): weight_values holds upscale_factor**2 entries while weight_shape holds
    # upscale_factor**2 * output_depth**2, so the reshape below can only succeed when
    # output_depth == 1 - confirm the intended weight layout.
    weights = create_const_tensor(
        "weights",
        weight_shape,
        ofm.dtype,
        np.array(weight_values).reshape(weight_shape),
        value_dtype=weight_value_dtype,
        quantization=weight_quant,
    )
    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(weights, 1)  # inputs tensor weight index

    # setup bias tensor by assign None and then call the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()

    return op

# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize ops upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    """Expand a resize op into a chain of x2 nearest-neighbour upscaling ops.

    `op` itself is reused as the first op of the chain; further ops are clones
    of it. The last op in the chain takes over the original outputs and is
    configured according to the original resize type and its "align_corners"
    attribute. Reads the "upscale_factor" attribute of `op`. Returns `op`.
    """
    final_outputs = op.outputs
    ifm_dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_hw = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    num_x2_steps = int(np.log2(upscale_factor))

    # Perform x2 upscaling n-1 times
    prev_op = op
    scaled_op = op
    for step in range(num_x2_steps - 1):
        if step > 0:
            # The first step reuses `op`; later steps run on clones chained to the previous output
            scaled_op = op.clone(f"_{step}")
            scaled_op.inputs[0] = prev_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_hw = upscaled_hw * 2
        step_shape = op.ofm_shapes[0].as_list()
        step_shape[1:3] = upscaled_hw
        step_ofm = Tensor(step_shape, ifm_dtype, f"{op.outputs[0].name}_{step}")
        step_ofm.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(step_ofm)
        prev_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()

    # Last x2 upscaling
    if num_x2_steps > 1:
        scaled_op = op.clone(f"_{num_x2_steps-1}")
        scaled_op.inputs[0] = prev_op.outputs[0]

    align_corners = scaled_op.attrs["align_corners"]
    if scaled_op.original_type == Op.ResizeBilinear:
        if align_corners:
            # no padding
            scaled_op.attrs["padding"] = Padding.VALID
        else:
            # padding to the right and bottom (limits average pool to 8x8 kernel)
            scaled_op.attrs["padding"] = Padding.EXPLICIT
            scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

        # kernel size dependent on the upscaling factor
        scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
    elif align_corners:  # Op.ResizeNearestNeighbor
        # use depthwise conv to select the correct value
        scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
    # otherwise keep 1x1 kernel and average pool

    # The last op of the chain produces the original output tensors
    scaled_op.outputs = final_outputs
    scaled_op.outputs[0].ops = [scaled_op]
    scaled_op.set_ifm_ofm_shapes()

    return op

Tim Hall

2022-07-21 11:46:03 +0100

[diff] [blame]

467

def fixup_resize(op, arch, nng):
    """Lower an NPU resize op into an equivalent supported form.

    Equal IFM/OFM shapes turn the op into an Identity with only its first
    input; a 1x1 IFM is converted to an Add; everything else is converted to
    upscaling average pools. Non-resize or non-NPU ops pass through.
    Returns `op`; `arch` and `nng` are unused.
    """
    if not (op.type.is_resize_op() and op.run_on_npu):
        return op

    ifm_shape = op.ifm_shapes[0]
    if ifm_shape == op.ofm_shapes[0]:
        # Bypass the resize op which is essentially a NOP
        op.inputs = op.inputs[:1]
        op.type = Op.Identity
    elif ifm_shape.height == 1 and ifm_shape.width == 1:
        convert_resize_1x1_to_add(op)
    else:
        convert_resize_to_upscale_and_average_pool(op)

    return op

def convert_nop_split_to_identity(op, arch, nng):
    """Turn a Split with a single split into an Identity op.

    Only the inputs whose shape equals the shape of the first output are kept.
    Returns `op`; `arch` and `nng` are unused.
    """
    if op.type != Op.Split or op.attrs.get("num_splits") != 1:
        return op

    # the filter should leave a list with a single tensor
    # if it doesn't, remove_passthrough_tensor will fail appropriately
    matching_inputs = []
    for candidate in op.inputs:
        if candidate.shape == op.outputs[0].shape:
            matching_inputs.append(candidate)
    op.inputs = matching_inputs
    op.type = Op.Identity
    return op

Ayaan Masood

2022-04-21 14:28:03 +0100

[diff] [blame]

490

def rewrite_fully_connected_input(op: Operation, arch, nng):
    """Store the 2D form of a FullyConnected op's IFM shape in `op.ifm_shapes[0]`.

    The 2D shape is requested from the IFM using the second-to-last weight
    dimension. Other op types pass through. Returns `op`.
    """
    if op.type != Op.FullyConnected:
        return op

    shape_2d = op.ifm.get_shape_as_2d(op.weights.shape[-2])
    assert shape_2d is not None, "Tensor can not be reshaped to 2D"
    op.ifm_shapes[0] = shape_2d
    return op

def convert_batched_fc_shape(op, arch, nng):
    """Fold the batch dimension of a batched FullyConnected op into height/width.

    For an IFM batch > 1 the IFM and OFM shapes become [1, h, w, depth], where
    batches of 4, 8 and 16 map to 2x2, 2x4 and 4x4 and any other batch n maps
    to 1xn. The weight values gain two leading unit dimensions.
    Returns `op`; `arch` and `nng` are unused.
    """
    if op.type != Op.FullyConnected:
        return op

    # Check if the first dimension indicates batching
    if op.ifm_shapes[0].batch <= 1:
        return op

    batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}

    ifm_batch = op.ifm_shapes[0].batch
    ifm_h, ifm_w = batching_split.get(ifm_batch, (1, ifm_batch))
    op.ifm_shapes[0] = Shape4D([1, ifm_h, ifm_w, op.ifm_shapes[0].depth])

    # Reshape Weights to be 4D. IO becomes HWIO
    weights = op.inputs[1]
    with_w_dim = np.expand_dims(weights.values, axis=0)
    weights.values = np.expand_dims(with_w_dim, axis=0)
    weights.set_all_shapes(list(weights.values.shape))

    ofm_batch = op.ofm_shapes[0].batch
    ofm_h, ofm_w = batching_split.get(ofm_batch, (1, ofm_batch))
    op.ofm_shapes[0] = Shape4D([1, ofm_h, ofm_w, op.ofm_shapes[0].depth])
    return op

def unfuse_activation_function(op):
    """Split the fused activation of an NPU ConcatTFLite op into its own operation.

    The new activation op produces the original output tensor, while `op` is
    redirected to write a cloned intermediate tensor that feeds the activation
    op. Ops that do not match are left untouched. Returns None.
    """
    if not (op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None):
        return

    fused = op.activation
    act_op = Operation(fused.op_type, op.name + fused.op_type.name)
    op.activation = None

    final_tens = op.outputs[0]
    intermediate_tens = final_tens.clone("_act_intermediate")

    # op -> intermediate tensor -> activation op -> original output tensor
    act_op.set_output_tensor(final_tens)
    act_op.add_input_tensor(intermediate_tens)
    op.set_output_tensor(intermediate_tens)
    act_op.set_ifm_ofm_shapes()

529

530

531

def rewrite_stridedslice_output(op, arch, nng):
    """Resolve the shrink/new axis masks of an NPU StridedSlice into 4D OFM shapes.

    For a non-zero "shrink_axis_mask" a unit dimension is inserted into the
    output shape for every set bit; otherwise, for a non-zero "new_axis_mask",
    dimensions are removed from the output shape. The processed mask attribute
    is reset to 0, `op.ofm_shapes` is updated and the resulting 4D axes are
    stored in the "split_axis_4D" attribute.

    Returns `op`; `arch` and `nng` are unused.
    """
    if not (op.run_on_npu and op.type == Op.StridedSlice):
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if new_axis_mask == 0 and shrink_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            bits_seen = 0
            axis = 0
            while shrink_axis_mask:
                before = shrink_axis_mask
                bits_seen += 1
                # Clear the lowest set bit; the difference isolates that bit
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(before - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - bits_seen)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            bits_seen = 0
            axis = 0
            while new_axis_mask:
                before = new_axis_mask
                bits_seen += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(before - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + bits_seen)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        op.attrs["split_axis_4D"] = axis_4D
    return op

def rewrite_unpack_output(op, arch, nng):
    """Give every output of an NPU Unpack op a 4D shape that keeps the unpacked axis.

    The op is retyped to Op.UnpackReshaped, each entry of `op.ofm_shapes` is
    set to the first output's shape with a unit dimension re-inserted at the
    unpack axis, and the matching 4D axis is stored per output in the
    "split_axis_4D" attribute. Other ops pass through untouched.

    Returns `op`; `arch` and `nng` are unused.
    """
    if not (op.run_on_npu and op.type == Op.Unpack):
        return op

    # Unpack is also referred to as Unstack
    out_shape = list(op.outputs[0].shape)
    axis = int(op.attrs["axis"])
    if axis < 0:
        # The axis indexes the input tensor, so a negative value wraps around by the
        # input rank. (Adding a further 1, as is correct for Pack where the axis
        # indexes the higher-rank output, would point one dimension too far.)
        axis += len(op.inputs[0].shape)
    op.type = Op.UnpackReshaped

    # Re-insert the dimension that the unpack removed
    desired_output_shape = out_shape[:axis] + [1] + out_shape[axis:]

    axis_4D = axis + (4 - len(desired_output_shape))
    op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

    for idx in range(len(op.outputs)):
        op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op

def add_padding_fields(op, arch, nng):
    """Compute and store "explicit_padding" and "skirt" for NPU ops that have padding.

    Only ops running on the NPU with a "padding" attribute are processed.
    Conv2DBackpropInputSwitchedBias uses the upscaled padding calculation; all
    other ops use the regular one based on `op.kernel`.

    Returns `op`. Raises UnsupportedFeatureError when the op has padding but is
    not a convolution, depthwise convolution, pool or ReduceSum operation.
    """
    if not op.run_on_npu or "padding" not in op.attrs:
        return op

    input_shape = op.ifm_shapes[0]
    output_shape = op.ofm_shapes[0]

    op_type = op.type
    if op_type.is_conv2d_op() or op_type.is_depthwise_conv2d_op():
        # Kernel height/width are the two leading weight dimensions
        kernel_size = op.inputs[1].shape[:2]
    elif op_type.is_pool_op() or op_type.npu_block_type == NpuBlockType.ReduceSum:
        kernel_size = op.attrs["ksize"][1:3]
    else:
        raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

    if op_type == Op.Conv2DBackpropInputSwitchedBias:
        upscaling_factor = output_shape.height // input_shape.height
        padding, skirt = calc_upscaled_padding_and_skirt(
            op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
        )
    else:
        padding, skirt = calc_padding_and_skirt(
            op.attrs["padding"],
            op.kernel,
            input_shape,
            op.attrs.get("explicit_padding"),
        )

    op.attrs["explicit_padding"] = padding
    op.attrs["skirt"] = skirt
    return op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

635

def reorder_depthwise_weights(op, arch, nng):
    """Swap the last two weight dimensions of a depthwise convolution.

    The weight tensor's values and shapes are updated and it is flagged with
    `weight_transpose_depthwise`. Other ops pass through.
    Returns `op`; `arch` and `nng` are unused.
    """
    if not op.type.is_depthwise_conv2d_op():
        return op

    weights = op.inputs[1]
    transposed = np.transpose(weights.values, (0, 1, 3, 2))
    weights.values = transposed
    weights.set_all_shapes(list(weights.values.shape))
    weights.weight_transpose_depthwise = True
    return op

def optimise_strided_conv(op, arch, nng):
    """Rewrite a horizontally strided Conv2DBias as a stride-1 conv on a folded IFM.

    Applies to the op with `op_index` 0 when the horizontal stride is 2, the
    IFM depth is at most 4, the IFM width is even, weights exist with a second
    dimension of at least 2, and the needed horizontal padding is unchanged by
    the rewrite. The IFM shape gets half the width and twice the depth, the
    weights are (zero-point padded if required and) reshaped to match, and the
    horizontal stride becomes 1.

    Returns `op`; `arch` and `nng` are unused.
    """
    if op.type != Op.Conv2DBias or op.op_index != 0:
        return op

    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]

    applicable = (
        stride_x == 2
        and ifm_shape.depth <= 4
        and ifm_shape.width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    )
    if not applicable:
        return op

    k_w, _ = op.get_kernel_size()
    half_width = ifm_shape.width // 2
    curr_padding_x = needed_total_padding(ifm_shape.width, 2, k_w)
    optimised_padding_x = needed_total_padding(half_width, 1, (k_w + 1) // 2)
    if curr_padding_x != optimised_padding_x:
        # Horizontal padding would become different after optimisation; this would not work
        return op

    # IFM
    op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])

    # Weights (the shape list is the tensor's own and is modified in place)
    weight_shape = weight_tensor.shape
    if weight_shape[1] % 2 != 0:
        # Odd kernel width: append one column filled with the weight zero point
        weight_shape[1] = weight_shape[1] + 1
        padded_array = np.zeros(weight_shape)
        zero_point_column = (1, weight_shape[2], weight_shape[3])
        for row in range(weight_shape[0]):
            padded_array[row] = np.vstack(
                [
                    weight_tensor.values[row],
                    np.full(zero_point_column, weight_tensor.quantization.zero_point),
                ]
            )
        weight_tensor.values = padded_array

    weight_shape[1] //= 2
    weight_shape[2] *= 2
    weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
    weight_tensor.set_all_shapes(weight_shape)
    # If multiple copies of the weights are used, we could avoid
    # them having the same address by changing the value_id
    weight_tensor.value_id = uuid.uuid4()

    # Strides
    stride_x = 1
    op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

    return op

def convert_conv_to_fc(op, arch, nng):
    """Rewrite a 1x1 convolution over a 1x1 IFM as a FullyConnected op.

    A Conv 1x1 can be equivalent to Fully Connected. By representing such
    convolutions as fully connected layers, Vela can better determine whether
    or not to use caching/double buffering for the weights (weights don't need
    to be reloaded for convolutions when IFM H and W are 1).

    The op is modified in place and returned; any other op is returned untouched.
    """
    if op.type != Op.Conv2DBias:
        return op

    ifm_shape = op.ifm_shapes[0]
    ifm_h, ifm_w = ifm_shape.height, ifm_shape.width
    kernel_h, kernel_w, _, _ = op.inputs[1].shape
    if not (ifm_h == 1 and ifm_w == 1 and kernel_h == 1 and kernel_w == 1):
        return op

    # Overwrite this op as a Fully Connected op
    op.name += "_fc"
    op.type = Op.FullyConnected
    op.attrs = {
        "weights_format": 0,
    }

    # Reshape the weights to 2D: HWIO becomes just IO (H and W are 1, so they can simply be dropped)
    weights = op.inputs[1]
    weights.values = weights.values.squeeze(axis=(0, 1))
    weights.set_all_shapes(list(weights.values.shape))

    DebugDatabase.add_optimised(op, op)
    return op

def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    """Give a ReLU whose IFM and OFM scaling differ its own primary op.

    A ReLU with differing IFM and OFM scaling cannot be fused with another
    primary op. In that case it is replaced by a no-op average pool that carries
    the original activation function and an explicit rescale of
    ifm_scale / ofm_scale.

    Returns the replacement op when a rewrite happened, otherwise ``op`` itself.
    """
    if not (op.run_on_npu and op.type.is_relu_op()):
        return op

    ifm = op.inputs[0]
    ofm = op.outputs[0]
    if check_quantized_tens_scaling_equal(ifm, ofm):
        # Same scaling on both sides: nothing to fix up
        return op

    # Override this op with its own primary op (avgpool)
    relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
    # And fuse the original activation function to it
    relu_fused_op.activation = create_activation_function(op.type)
    # Add explicit rescaling
    rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
    multiplier, shift = scaling.quantise_scale(rescale)
    relu_fused_op.rescale = ExplicitScaling(False, [shift], [multiplier])
    # Tidy up and assign the ifm and ofm to the new op
    ifm.consumer_list.remove(op)

    relu_fused_op.add_input_tensor(ifm)
    relu_fused_op.set_output_tensor(ofm)
    relu_fused_op.set_ifm_ofm_shapes()

    # Record the replacement so the new op can be traced back to the original ReLU,
    # as the other rewrites in this file do for the ops they create
    DebugDatabase.add_optimised(op, relu_fused_op)
    return relu_fused_op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

746

def convert_softmax(op, arch, nng):
    """Replace a Softmax that runs on the NPU with the graph built by ``SoftMax``.

    Ops of any other type, or Softmax ops not running on the NPU, are returned as is.
    """
    if op.type != Op.Softmax or not op.run_on_npu:
        return op
    return SoftMax(op).get_graph()

Fredrik Svedberg

2022-08-19 16:06:04 +0200

[diff] [blame]

753

def convert_prelu(op, arch, nng):
    """Lower a PReLU operator to operators with simpler semantics.

    Depending on the (constant) alpha tensor one of three rewrites is used:

    * all alpha values equal: the op becomes a Relu (alpha == 0) or a LeakyRelu
      with ``alpha``/``alpha_scaling`` attributes, modified in place;
    * largest alpha below 1: Max(alpha * IFM, identity * IFM);
    * otherwise (including non-constant alpha): Min/Mul on the negative part,
      Relu on the positive part, combined by a RescaleAdd without scaling.

    Returns the op that produces the OFM after the rewrite, or ``op`` unchanged
    when it is not a PReLU or one of its tensors is missing.
    """
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If const alpha check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu.
            # Note: builtin int is used for the cast; the np.int alias no longer exists
            # in current NumPy releases.
            alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
                # Multiply with alpha tensor
                mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
                mul_alpha.add_input_tensor(ifm)
                mul_alpha.add_input_tensor(alpha)
                fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
                mul_alpha.set_output_tensor(fm_alpha)
                mul_alpha.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, mul_alpha)
                if check_quantized_tens_scaling_equal(ifm, ofm):
                    # No scaling is needed
                    fm_id = ifm
                else:
                    # Add multiplication with identity
                    mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
                    mul_identity.add_input_tensor(ifm)
                    # Create const tensor containing identity as scalar
                    quantization = ifm.quantization.clone()
                    quantization.scale_f32 = np.float32(1)
                    quantization.zero_point = 0
                    one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
                    mul_identity.add_input_tensor(one)
                    # Make sure that fm_id is allocated to a different address than fm_alpha
                    fm_id = ofm.clone(op.name + "_id", set_unique=True)
                    mul_identity.set_output_tensor(fm_id)
                    mul_identity.set_ifm_ofm_shapes()

                # Combine scaled and alpha multiplied values
                max_op = Operation(Op.Maximum, op.name + "_max")
                max_op.add_input_tensor(fm_alpha)
                max_op.add_input_tensor(fm_id)
                max_op.set_output_tensor(ofm)
                max_op.set_ifm_ofm_shapes()

                DebugDatabase.add_optimised(op, max_op)
                ifm.consumer_list.remove(op)
                return max_op

        # Catch all PReLU conversion for the cases that could not be optimised above
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
        min_op.set_output_tensor(fm_negative)
        min_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, min_op)

        # and multiply with alpha tensor
        mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
        mul_alpha.add_input_tensor(fm_negative)
        mul_alpha.add_input_tensor(alpha)
        fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
        mul_alpha.set_output_tensor(fm_alpha)
        mul_alpha.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_alpha)

        # Select (and scale) values > 0
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_scaled)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)

        # Add scaled and alpha multiplied values (without scaling)
        add_op = Operation(Op.RescaleAdd, op.name + "_add")
        add_op.rescale = (1, 0)  # No scale or shift
        add_op.add_input_tensor(fm_alpha)
        add_op.add_input_tensor(fm_scaled)
        add_op.set_output_tensor(ofm)
        add_op.set_ifm_ofm_shapes()

        DebugDatabase.add_optimised(op, add_op)
        ifm.consumer_list.remove(op)
        op = add_op

    return op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

869

def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X   For X = -1 or X > 0
    |   \   /    This subgraph can be replaced with either
    |    Mul     an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max

    The Maximum op is rewritten in place and returned. Note that the code accepts
    a constant value of 0 for the LeakyReLU case as well (the test is ``>= 0``).
    When any precondition fails the op is returned without the graph being modified.
    """
    if op.type != Op.Maximum:
        return op

    # finds the Mul input(s) to the Max
    muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
    if len(muls) == 1:
        mul = muls[0].ops[0]
    elif len(muls) == 2:
        # In the case both inputs are Muls, find the one with the same input as the Max
        mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
        if len(mul_ifms):
            mul = mul_ifms[0].ops[0]
        else:
            # Not using same input
            return op
    else:
        # No Mul inputs
        return op

    # make sure the Mul doesn't have any other consumers
    mul_ofm = mul.outputs[0]
    if len(mul_ofm.consumers()) != 1:
        return op
    # make sure the Mul doesn't have a fused activation function
    if mul.activation:
        return op
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op

    if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
        return op
    if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
        # rewrite to LeakyRelu currently only makes sense if the quantization is identical
        return op

    # finds the branched input that goes to both the Max and the Mul
    shared = set(op.inputs) & set(mul.inputs)
    if len(shared) != 1:
        return op
    shared_in = shared.pop()
    # find the constant scalar input to the Mul
    other_mul_inputs = set(mul.inputs) - {shared_in}
    if not other_mul_inputs:
        # The Mul multiplies the shared input with itself; there is no scalar operand
        return op
    const_tens = other_mul_inputs.pop()
    # check that it is a scalar
    if const_tens.shape != []:
        return op
    const = const_tens.ops[0]
    # check that it is a constant
    if const.type != Op.Const:
        return op

    val = const.outputs[0].values

    if val >= 0:
        new_op = Op.LeakyRelu
        op.attrs["alpha"] = val
        # to produce bit exact results, the alpha is not enough;
        # save additional scaling info in attr "alpha_scale", to be used as input
        # to the LUT construction
        alpha_scalar = const_tens.values - const_tens.quantization.zero_point
        mul_ifm_scale = np.double(ifm.quantization.scale_f32)
        mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
        mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
        alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
        op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
    elif val == -1:
        new_op = Op.Abs
    else:
        # Unsupported constant: leave the Mul/Max subgraph fully intact
        return op

    # Only now that the rewrite is certain, remove the Mul from the shared input's
    # consumers. Doing this before the value check would leave the Mul detached from
    # the consumer list while it is still part of the graph.
    shared_in.consumer_list.remove(mul)

    op.type = new_op
    op.name = op.name.replace("Maximum", new_op.name)
    op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
    op.inputs = [shared_in]
    op.set_ifm_ofm_shapes()

    # Record optimisation in debug database
    DebugDatabase.add_optimised(op, op)

    return op

def convert_hardswish_to_lut(op, arch, nng):
    """Convert a HardSwish op into a LUT based op.

    A 256-entry table is generated with 16-bit fixed point arithmetic from
    ``fp_math`` and handed to ``convert_to_lut``. Ops of any other type are
    returned unchanged.
    """
    if op.type != Op.HardSwish:
        return op

    ifm, ofm = op.get_ifm_ofm()
    # Generate the LUT
    ifm_scale = np.double(ifm.quantization.scale_f32)
    ofm_scale = np.double(ofm.quantization.scale_f32)
    zp_in = ifm.quantization.zero_point
    zp_out = ofm.quantization.zero_point
    ifm_scale_hires = (1 / 128) * ifm_scale
    relu_multiplier = np.double(3 / 32768)
    out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
    relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
    # Use 16bit scale
    out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
    relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)

    # The final output shift does not depend on the table index
    headroom = 31 - out_shift
    final_shift = -headroom if headroom < 0 else 0

    lut_inputs = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    quantized_min = min(lut_inputs)
    quantized_max = max(lut_inputs)
    values = []
    for x in lut_inputs:
        hires = (x - zp_in) * 128
        # Compute the input value on essentially the output scale, not shifted yet
        preshift = fp_math.saturating_rounding_mul16(hires, out_scale_16)
        # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
        relu_value = np.int16(hires)
        if relu_shift < 31:
            relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)

        relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)

        if relu_shift < 31:
            relu_value = fp_math.shift_left16(relu_value, 1)

        if relu_shift > 31:
            relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)

        # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
        # Now convert that to a 16bit fixedpoint value in [0, 1]
        relu_value = (relu_value + (1 << 15)) >> 1
        product = fp_math.saturating_mul16(relu_value, preshift)
        # Finally apply the output shift and clamp to the quantized range
        entry = fp_math.rounding_divide_by_pot(product, final_shift) + zp_out
        values.append(min(quantized_max, max(quantized_min, entry)))
    return convert_to_lut(op, values, "hardswish")

def convert_lrelu_to_mul_max(op, arch):
    """Lower a LeakyRelu to elementwise operations.

    For 0 < alpha < 1 the op becomes Max(alpha * IFM, identity * IFM), i.e. the
    opposite of convert_mul_max_to_abs_or_lrelu. For any other alpha the negative
    part is selected with Min(IFM, 0) and multiplied by alpha, the positive part is
    produced by a Relu, and ``op`` becomes a RescaleAdd (without scale or shift) of
    the two.

    ``op`` is modified in place and returned; it is returned untouched when its
    IFM or OFM is missing.
    """
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op

    alpha = np.float32(op.attrs["alpha"])
    use_mul_max = 0 < alpha < 1
    is_converted_prelu = "alpha_scaling" in op.attrs
    if use_mul_max:
        mul_ifm = ifm
        new_op = Op.Maximum
    else:
        # Need to use a different approach for alpha < 0 or alpha > 1
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
        if alpha < 0 and not is_converted_prelu:
            # For negative alpha that is not from a converted PReLU we need to use
            # int32 Mul below to perform the (negative) alpha scaling
            mul_ifm.dtype = DataType.int32
        min_op.set_output_tensor(mul_ifm)
        min_op.set_ifm_ofm_shapes()
        new_op = Op.RescaleAdd
        op.rescale = (1, 0)  # No scale or shift
        DebugDatabase.add_optimised(op, min_op)

    # Add multiplication with alpha
    mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
    mul_alpha.add_input_tensor(mul_ifm)
    # Create const tensor containing alpha as scalar
    alpha_quant = ifm.quantization.clone()
    alpha_quant.min = 0
    alpha_quant.max = alpha * (alpha_quant.quant_max - alpha_quant.quant_min)
    alpha_quant.zero_point = 0
    alpha_dtype = mul_ifm.dtype
    if is_converted_prelu:
        # The LeakyRelu was the result from convert_prelu and the scaling is provided
        scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
        mul_alpha.type = Op.RescaleMul
        mul_alpha.rescale = [alpha_scale, alpha_shift]
    elif alpha == 0 or np.isinf(1 / alpha):
        # Handling of alpha near or at zero
        alpha_quant.scale_f32 = np.float32(1)
        scalar = 0
    else:
        alpha_quant.scale_f32 = alpha
        if alpha_dtype == DataType.int32:
            # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
            scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
        else:
            scalar = 1
    alpha_tens = create_const_tensor(
        op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], alpha_dtype.as_numpy_type(), quantization=alpha_quant
    )
    mul_alpha.add_input_tensor(alpha_tens)
    fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
    mul_alpha.set_output_tensor(fm_alpha)
    mul_alpha.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, mul_alpha)

    if not use_mul_max:
        # Positive part of the input, produced by a separate Relu
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_id)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)
    elif check_quantized_tens_scaling_equal(ifm, ofm):
        # No identity multiplication is needed
        fm_id = ifm
    else:
        # Add multiplication with identity
        mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
        mul_identity.add_input_tensor(ifm)
        # Create const tensor containing identity as scalar
        identity_quant = ifm.quantization.clone()
        identity_quant.min = 0
        identity_quant.max = identity_quant.quant_max - identity_quant.quant_min
        identity_quant.scale_f32 = np.float32(1)
        identity_quant.zero_point = 0
        identity_tens = create_const_tensor(
            op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=identity_quant
        )
        mul_identity.add_input_tensor(identity_tens)
        # Make sure that fm_id is allocated to a different address than fm_alpha
        fm_id = ofm.clone(op.name + "_id", set_unique=True)
        mul_identity.set_output_tensor(fm_id)
        mul_identity.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_identity)

    # Convert the LeakyRelu itself (to Maximum or RescaleAdd) and feed it the
    # results of the operations created above
    op.type = new_op
    op.name = op.name.replace("LeakyRelu", new_op.name)
    op.inputs = []
    ifm.consumer_list.remove(op)
    op.add_input_tensor(fm_alpha)
    op.add_input_tensor(fm_id)
    op.set_ifm_ofm_shapes()

    DebugDatabase.add_optimised(op, op)
    return op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1125

def convert_to_lut8(op, fn, fn_name):
    """Convert ``op`` to a no-op + int8/uint8 LUT generated with the given function.

    ``fn`` is a function(real) -> real that is sampled at every quantized input
    value; ``fn_name`` is passed on to ``convert_to_lut``. The op is returned
    unchanged unless IFM and OFM share the same 8-bit data type.
    """
    ifm, ofm = op.get_ifm_ofm()
    if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
        return op

    # Generate the LUT
    ifm_scale = np.double(ifm.quantization.scale_f32)
    ofm_scale = np.double(ofm.quantization.scale_f32)
    zp_in = ifm.quantization.zero_point
    zp_out = ofm.quantization.zero_point
    lut_inputs = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    quantized_min = min(lut_inputs)
    quantized_max = max(lut_inputs)

    values = []
    for x in lut_inputs:
        # Dequantize, apply the function, requantize and clamp to the 8-bit range
        y_real = fn(ifm_scale * (x - zp_in))
        entry = round_away_zero(zp_out + y_real / ofm_scale)
        values.append(min(quantized_max, max(quantized_min, entry)))
    return convert_to_lut(op, values, fn_name)

1147

1148

1149

def convert_lrelu_to_lut(op, arch):
    """Convert a LeakyRelu into a LUT based op.

    Inputs below the IFM zero point are scaled with the alpha multiplier, all
    other inputs with the identity (ifm_scale / ofm_scale) multiplier. If the op
    carries an "alpha_scaling" attribute, its (scalar, scale, shift) triple
    replaces the alpha scaling derived from the "alpha" attribute.
    """
    ifm, ofm = op.get_ifm_ofm()
    # Generate the LUT
    alpha = op.attrs["alpha"]
    ifm_scale = np.double(ifm.quantization.scale_f32)
    ofm_scale = np.double(ofm.quantization.scale_f32)
    zp_in = ifm.quantization.zero_point
    zp_out = ofm.quantization.zero_point
    identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
    alpha_scalar = 1
    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
    if "alpha_scaling" in op.attrs:
        # Precomputed scaling stored on the op by an earlier rewrite takes precedence
        alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]

    lut_inputs = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    quantized_min = min(lut_inputs)
    quantized_max = max(lut_inputs)

    values = []
    for x in lut_inputs:
        centred = x - zp_in
        if x < zp_in:
            # Negative side: apply alpha
            scaled = fp_math.multiply_by_quantized_multiplier(alpha_scalar * centred, alpha_scale, alpha_shift)
        else:
            # Non-negative side: only rescale from IFM to OFM quantization
            scaled = fp_math.multiply_by_quantized_multiplier(centred, identity_scale, identity_shift)
        values.append(min(quantized_max, max(quantized_min, zp_out + scaled)))
    return convert_to_lut(op, values, "lrelu")

1177

1178

1179

def convert_lrelu(op, arch, nng):
    """Convert a LeakyRelu to a LUT based solution if possible, otherwise a mul + max.

    Special cases: an alpha of 0 turns the op into a Relu, and an int16 op with
    equal input/output scaling and positive alpha is kept unmodified.
    """
    if op.type != Op.LeakyRelu:
        return op
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op

    alpha = op.attrs["alpha"]
    if alpha == 0:
        # When alpha is 0 the operation can be converted to a ReLU
        op.type = Op.Relu
        op.name = op.name.replace("LeakyRelu", op.type.name)
        return op

    is_8bit = ifm.dtype in (DataType.uint8, DataType.int8)
    if is_8bit and ifm.dtype == ofm.dtype:
        # use LUT for int8/uint8
        return convert_lrelu_to_lut(op, arch)

    same_scaling = check_quantized_tens_scaling_equal(ifm, ofm)
    if same_scaling and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
        # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
        return op
    return convert_lrelu_to_mul_max(op, arch)

1199

1200

1201

def convert_tanh_sigmoid_to_lut(op, arch, nng):
    """Convert int8/uint8 Sigmoid and Tanh to a LUT based solution.

    Ops of any other type are returned unchanged.
    """
    op_type = op.type
    if op_type == Op.Sigmoid:
        fn, fn_name = clamp_sigmoid, "sigmoid"
    elif op_type == Op.Tanh:
        fn, fn_name = math.tanh, "tanh"
    else:
        return op
    return convert_to_lut8(op, fn, fn_name)

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

1210

def remove_memory_only_ops(op, arch):
    """Bypass ``op`` if it runs on the NPU and its type is in ``memory_only_ops``.

    Nothing is returned.
    """
    if not op.run_on_npu:
        return
    if op.type in memory_only_ops:
        bypass_memory_only_ops(op)

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1213

1214

1215

def fuse_activation_function_with_prev(op, arch, nng):
    """If ``op`` is a no-op, try to move its activation function to the preceding op.

    On success the preceding op takes over the activation (and LUT, if any) and
    is made to write the OFM of ``op`` directly. ``op`` itself is returned in
    every case.
    """
    is_nop = op.attrs.get("is_nop", False)
    if not is_nop or op.activation is None:
        return op
    ifm, ofm = op.get_ifm_ofm()
    if ifm is None or ofm is None:
        return op

    # The producer of the IFM is the candidate to take over the activation
    prev_op = ifm.ops[0]
    # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
    can_fuse = (
        prev_op.run_on_npu
        and prev_op.type.npu_block_type != NpuBlockType.Default
        and len(ifm.ops) == 1
        and len(prev_op.outputs[0].consumers()) == 1
        and prev_op.activation is None
    )
    # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
    # LUT currently only works correctly for elementwise ops
    lut_blocks_fusing = op.activation_lut is not None and arch.shram_reserved_unused_banks == 0
    if lut_blocks_fusing or not can_fuse:
        return op

    # Move the fused activation function + corresponding info to prev_op
    prev_op.activation = op.activation
    prev_op.forced_output_quantization = op.forced_output_quantization
    if op.activation_lut is not None:
        prev_op.set_activation_lut(op.activation_lut)
    # Bypass op
    prev_op.set_output_tensor(ofm)
    DebugDatabase.add_optimised(op, prev_op)
    return op

def _leading_pad_ok(leading_pad, stride, kernel_size):

1250

# If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,

1251

# otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns

1252

max_size = kernel_size // 2

1253

return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0

1254

1255

1256

def replace_pad_by_hw_pad(op: Operation, arch, nng):
    """
    Tries to completely remove a PAD operator by using hardware padding.
    E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
    is rewritten such that the PAD is removed, and the CONV uses SAME padding.
    Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
    if both operations can be run on the NPU.
    This is the most efficient way to implement PAD, but cannot be done for all pad sizes.

    An average pool consumer is additionally converted to a depthwise convolution
    with constant weights. The (possibly modified) op is returned.
    """
    is_candidate = (
        (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
        and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
        and op.run_on_npu
        and op.attrs["padding"] == Padding.VALID
    )
    if not is_candidate:
        return op

    pad_op = op.ifm.ops[0]
    if pad_op.type != Op.Pad or not pad_op.run_on_npu:
        return op
    if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
        return op
    top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
    kernel = op.kernel
    k_w, k_h = kernel.dilated_wh()

    # Check if the PAD operator can be replaced by hardware padding
    if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
        # Too much padding, it would require hardware padding to actually insert zeros
        return op
    if not _leading_pad_ok(top, kernel.stride.y, k_h) or not _leading_pad_ok(left, kernel.stride.x, k_w):
        return op

    if op.type.is_avgpool_op():
        # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
        pads_and_sizes = ((left, k_w), (right, k_w), (top, k_h), (bottom, k_h))
        if any(pad not in (0, k_size // 2) for pad, k_size in pads_and_sizes):
            return op
        # Average pool is converted to depthwise, because NPU average pool + same padding
        # has a special implementation that is different from PAD followed by average pool with
        # valid padding.
        pool_w, pool_h = op.kernel.width, op.kernel.height
        ifm = op.ifm
        # Remember other inputs
        other_inputs = op.inputs[1:]
        # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
        quantization = QuantizationParameters(0.0, 255.0)
        quantization.scale_f32 = 1.0 / (pool_w * pool_h)
        quantization.zero_point = 0
        shape = [pool_h, pool_w, 1, op.ofm.shape[-1]]
        weights = np.full(shape, 1)

        weight_tens = create_const_tensor(
            op.name + "_weights",
            shape,
            op.ifm.dtype,
            weights,
            np.uint8,
            purpose=TensorPurpose.Weights,
            quantization=quantization,
        )
        weight_tens.values = weights
        op.type = Op.DepthwiseConv2DBias
        op.inputs = []
        op.add_input_tensor(ifm)
        op.add_input_tensor(weight_tens)
        # Add bias tensor, all biases set to 0
        op.inputs.append(None)
        fixup_bias_tensors(op, arch, nng)
        # Add other inputs
        op.inputs.extend(other_inputs)
        op.rounding_mode = NpuRoundingMode.NATURAL

    # Bypass the PAD operator
    op.set_input_tensor(pad_op.ifm, 0)
    # Adjust the padding attributes of the convolution operator
    op.attrs["padding"] = Padding.EXPLICIT
    op.attrs["explicit_padding"] = (top, left, bottom, right)
    op.set_ifm_ofm_shapes()
    return op

def convert_pad(op: Operation, arch, nng):
    """
    Rewrites PAD operator to an average pool that copies the IFM to the OFM
    + up to 4 average pool operators that fill the OFM with zeros at the borders.
    This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad

    Returns the average pool that copies the IFM; ``op`` itself is retyped to
    ConcatTFLite. Ops that are not a PAD running on the NPU are returned unchanged.
    """
    if op.type != Op.Pad or not op.run_on_npu:
        return op
    top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)

    ifm = op.ifm
    assert ifm is not None
    ifm_shape = op.ifm_shapes[0]
    ofm = op.ofm
    assert ofm is not None
    ofm.ops = []
    ofm_shape = op.ofm_shapes[0]

    # Average pool op that copies IFM to the right place inside the OFM
    origin = Shape4D(0, 0, 0, 0)
    below_top = origin.with_height(top)
    avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, below_top.with_width(left))
    avgpool_op.activation = op.activation
    quant = ofm.quantization
    pad_value = quant.zero_point

    def fill_border(suffix, border_shape, write_offset):
        # Creates a constant tensor holding the pad value and an average pool that
        # writes it into the OFM at write_offset
        border_tens = create_const_tensor(
            op.name + suffix,
            border_shape.as_list(),
            ofm.dtype,
            border_shape.elements() * [pad_value],
            np.uint8,
            quantization=quant,
        )
        # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
        border_tens.equivalence_id = create_equivalence_id(tuple(border_tens.values))
        create_avg_pool_for_concat(op, op.name + suffix, border_tens, border_shape, write_offset)

    # Add operations that fill the borders of the OFM
    if top > 0:
        fill_border("_top", Shape4D(1, top, ofm_shape.width, ofm_shape.depth), origin)
    if bottom > 0:
        fill_border(
            "_bottom",
            Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth),
            origin.with_height(ofm_shape.height - bottom),
        )
    if left > 0:
        fill_border("_left", Shape4D(1, ifm_shape.height, left, ofm_shape.depth), below_top)
    if right > 0:
        fill_border(
            "_right",
            Shape4D(1, ifm_shape.height, right, ofm_shape.depth),
            below_top.with_width(ofm_shape.width - right),
        )

    op.type = Op.ConcatTFLite
    return avgpool_op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1410

def fixup_bias_tensors(op, arch, nng):
    """Attach an all-zero bias tensor to an operation that needs a bias but has none.

    Operations whose type does not need a bias, or that already carry one,
    are handed back untouched.

    Args:
        op: Operation to inspect; it is modified in place when a bias is added.
        arch: Not used by this rewrite.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    bias_missing = op.type.needs_bias() and op.bias is None
    if not bias_missing:
        return op

    # One zero per element along the last axis of the operation's second input
    bias_count = op.inputs[1].shape[-1]
    zero_bias = create_const_tensor(op.name + "_bias", [bias_count], DataType.int32, [0] * bias_count)
    op.set_input_tensor(zero_bias, op.type.info.indices.biases[0])
    return op

Fredrik Svedberg

2021-11-01 14:25:29 +0100

[diff] [blame]

1421

def fixup_asymmetric_weights(op, arch, nng):
    """Zero the weight zero points of int8 convolutions that have asymmetric weights.

    Only operations that run on the NPU, are a conv2d or depthwise conv2d
    type and have an int8 IFM are considered. If any weight zero point is
    non-zero a warning is printed and the zero points are cleared in place.

    Args:
        op: Operation to inspect; its weight quantization may be modified.
        arch: Not used by this rewrite.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    if not op.run_on_npu:
        return op

    is_convolution = op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op()
    if not is_convolution:
        return op

    ifm_is_int8 = op.ifm.dtype == DataType.int8
    if not ifm_is_int8:
        return op

    weight_quant = op.weights.quantization
    if np.all(weight_quant.zero_point == 0):
        # Weights are already symmetric
        return op

    print(f"Warning: {op.type} '{op.name}' has asymmetric weights, zero points have been adjusted.")
    # In-place multiply clears the value while keeping whatever type/shape the zero point has
    weight_quant.zero_point *= 0
    return op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1431

def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
    """Rewrite an NPU-scheduled Mean operation as a depthwise convolution or an average pool.

    The operation is first turned into a DepthwiseConv2DBias. Depending on
    the input rank, the reduction axes, ``keep_dims``, the input data type
    and the IFM/OFM quantization, one of the following is then set up:

    * 4D input, axes [1, 2], keep_dims, uint8: depthwise conv with
      ``low_precision_scaling``; the zero point correction is applied either
      through a forced OFM zero point or, when it does not fit in 0..255,
      through an extra Add operation fed from an int16 intermediate tensor.
    * 4D input, axes [1, 2], keep_dims, otherwise (asserted int8): depthwise
      conv producing an int32 sum, followed by an extra Mul operation that
      scales by approximately 1/N.
    * IFM and OFM share zero point and scale: the op becomes an AvgPool with
      truncating rounding and the function returns early.
    * Anything else: depthwise conv with weight scale 1/(h*w) and a bias
      compensating for the input zero point.

    For the depthwise variants a weight tensor of ones (and, when the bias
    value is non-zero, a bias tensor) is added as input 1 (and 2).

    Args:
        op: Operation to inspect; modified in place when it is an NPU Mean.
        arch: Not used by this rewrite.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    if op.type == Op.Mean and op.run_on_npu:
        keep_dims = op.attrs.get("keep_dims", False)
        inp, axis = op.inputs
        shape = inp.shape
        # NOTE(review): ofm_shape refers to op.ofm.shape itself (assuming the attribute returns the
        # tensor's own list), so the insert() calls further down modify the OFM tensor shape in place
        ofm_shape = op.ofm.shape
        dims = len(shape)
        dims_ofm = len(ofm_shape)

        # Height and width axes have different index depending on dimensions
        if axis.shape == [] or axis.shape[0] == 1:  # single axis
            # From here on axis is a plain int for the single-axis case
            axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
            if dims in (2, 3):
                if axis == 0:
                    h, w = shape[axis], 1
                else:
                    h, w = 1, shape[axis]
            else:
                if axis == 1:
                    h, w = shape[axis], 1
                else:
                    h, w = 1, shape[axis]
        else:  # multiple axes
            # From here on axis is a sorted list; the unpacking below requires exactly two axes
            axis = sorted(axis.values)
            h, w = [shape[i] for i in axis]

        # Set necessary depthwise attributes
        op.attrs.update(
            {
                "padding": Padding.VALID,
                "stride_h": 1,
                "stride_w": 1,
                "strides": (1, 1, 1, 1),
                "depth_multiplier": 1,
                "channel_multiplier": 1,
                "dilation_h_factor": 1,
                "dilation_w_factor": 1,
                "dilation": (1, 1, 1, 1),
            }
        )
        # Change op type
        op.type = Op.DepthwiseConv2DBias
        # Set IFM/OFM shapes after changing op type
        op.set_ifm_ofm_shapes()

        # Defaults; the branches below override them where needed
        weight_scale, bias = 1, None
        ofmq, ifmq = op.ofm.quantization, inp.quantization
        # Set rounding mode, scaling and zero point based on which reference implementation to match
        if len(shape) == 4 and axis == [1, 2] and keep_dims:
            if inp.dtype == DataType.uint8:
                # This attribute means a different scaling calculation is used in order to match reference
                op.low_precision_scaling = True
                weight_scale = h * w
                # Set zero points to 0 as they will be adjusted for with bias term
                foq = ofmq.clone()
                foq.zero_point = 0
                fiq = ifmq.clone()
                fiq.zero_point = 0
                op.forced_input_quantization = fiq
                bias_term = ofmq.zero_point - round_up_to_int(ifmq.zero_point * ifmq.scale_f32 / ofmq.scale_f32)
                # If the bias term is outside uint8 range, we need an Add op to apply it.
                if bias_term < 0 or bias_term > 255:
                    intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
                    # Bias term has higher bitness (i32) than input/output (u8).
                    # 16 bits is enough since the bias is added/subtracted from a u8 value,
                    # the bias can only effectively assume values in the range [-255, 255].
                    intermediate.dtype = DataType.int16
                    intermediate.quantization.zero_point = 0
                    add_op = Operation(Op.Add, op.name + "_bias")
                    add_op.forced_output_quantization = foq
                    add_op.add_input_tensor(intermediate)
                    quant = QuantizationParameters()
                    quant.zero_point = 0
                    bias_term_tens = create_const_tensor(
                        op.name + "_bias",
                        [1, 1, 1, 1],
                        DataType.int16,
                        [bias_term],
                        np.int16,
                        quantization=quant,
                    )
                    add_op.add_input_tensor(bias_term_tens)
                    # The Add op takes over the original OFM and activation; the Mean-derived op
                    # now writes to the intermediate tensor instead
                    add_op.set_output_tensor(op.ofm)
                    add_op.set_ifm_ofm_shapes()
                    add_op.activation = op.activation
                    op.activation = None
                    op.set_output_tensor(intermediate)
                    op.set_ifm_ofm_shapes()
                # If not, we can just do it with the OFM zero point.
                else:
                    foq.zero_point = bias_term
                    op.forced_output_quantization = foq
            else:
                assert inp.dtype == DataType.int8
                # Use a depthwise to calculate the sum,
                # followed by a multiplication with 1/N to get the MEAN
                weight_scale = 1
                intermediate = op.ofm.clone(suffix="_intermediate", set_unique=True)
                intermediate.dtype = DataType.int32
                mul_op = Operation(Op.Mul, op.name + "_mul")
                mul_op.add_input_tensor(intermediate)
                mul_op.set_output_tensor(op.ofm)
                # Create scalar containing 1/N
                quant = QuantizationParameters()
                quant.zero_point = 0
                # The reference rounds negative numbers downwards, e.g. -1.5 is rounded to -2,
                # while rounding mode NATURAL would round this to -1.
                # This can only occur if N is even, and can be emulated by
                # multiplying with a number that is slightly smaller than 1/N.
                # It must be so small that other roundings are not affected;
                # the calculated value is based on worst case,
                # which is sum 256 * N (the maximum sum that can occur with int8)
                n = int(h * w)
                eps = 1 / (256 * (n + 1)) if n % 2 == 0 else 0
                quant.scale_f32 = 1 / (n - eps)

                # For int8/int16 we could use IFM/OFM scaling to do the division
                #   intermediate * 1 -> scale > round and shift.
                #
                # For int32 scaling is not supported so instead multiply with the scale
                #   intermediate * scale -> round and shift.
                #
                # Calculate the scale and shift value. const Tensor must be created
                # with correct quantization since the scale and shift is calculated later
                # in the command stream generator.
                mul_scale, _ = scaling.elementwise_mul_scale(
                    mul_op.ifm.quantization.scale_f32, quant.scale_f32, mul_op.ofm.quantization.scale_f32
                )
                scalar = create_const_tensor(
                    op.name + "_scalar", [1, 1, 1, 1], DataType.int32, [mul_scale], np.int32, quantization=quant
                )
                mul_op.add_input_tensor(scalar)
                mul_op.set_ifm_ofm_shapes()
                mul_op.rounding_mode = NpuRoundingMode.NATURAL
                # The Mul op takes over the activation; the Mean-derived op now writes to the
                # int32 intermediate tensor
                mul_op.activation = op.activation
                op.activation = None
                op.set_output_tensor(intermediate)
                op.set_ifm_ofm_shapes()
        elif ifmq.zero_point == ofmq.zero_point and ifmq.scale_f32 == ofmq.scale_f32:
            # Here we can just use a simple AvgPool with truncating rounding,
            # as we're emulating simple integer division.
            op.rounding_mode = NpuRoundingMode.TRUNCATE
            op.type = Op.AvgPool
            op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
        else:
            op.rounding_mode = NpuRoundingMode.NATURAL
            weight_scale = 1 / (h * w)
            # Input zero point is adjusted after mean calculation, so we emulate that with a bias
            bias = -ifmq.zero_point * h * w
            fiq = ifmq.clone()
            fiq.zero_point = 0
            op.forced_input_quantization = fiq

        # Change dimensions to 4
        def extend_dims(dim, in_shape):
            # Returns a new list with a leading 1 (and, for 2D, a trailing 1 as well) when dim < 4;
            # otherwise returns in_shape unchanged
            if dim < 4:
                in_shape = [1] + in_shape
                if dim == 2:
                    in_shape += [1]
            return in_shape

        if dims < 4 or dims_ofm < 4:
            # Fix the ofm dimension when keep_dims is false
            # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
            if isinstance(axis, int) and dims_ofm + 1 == dims:
                ofm_shape.insert(axis, 1)
            elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
                for i in axis:
                    ofm_shape.insert(i, 1)
            shape = extend_dims(dims, shape)
            dims_ofm = len(ofm_shape)
            # NOTE(review): the rebound ofm_shape is not read again in this function
            ofm_shape = extend_dims(dims_ofm, ofm_shape)
            op.set_ifm_ofm_shapes()

        # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
        if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
            shape = [shape[0], 1, h * w, shape[3]]
            op.ifm_shapes[0] = Shape4D(shape)
            if h > 256 and op.type == Op.AvgPool:
                op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})

        # If the AvgPool version is used, we don't need to do anything else
        if op.type == Op.AvgPool:
            return op

        # Make unit weight tensor quantization
        weight_quant = ifmq.clone()
        weight_quant.min = 0
        weight_quant.max = 255
        weight_quant.scale_f32 = weight_scale
        weight_quant.zero_point = 0

        # Set weight shape to [H,W,C,B]
        weight_shape = [h, w, shape[3], shape[0]]

        # Add unit weight tensor
        op.set_input_tensor(
            create_const_tensor(
                "weights",
                weight_shape,
                inp.dtype,
                np.ones(weight_shape),
                value_dtype=np.uint8,
                quantization=weight_quant,
            ),
            1,
        )
        op.weights.values = np.reshape(op.inputs[1].values, weight_shape)

        # Add None bias tensor
        op.inputs.append(None)
        # Add bias tensor
        # (skipped when bias is None or evaluates to zero, leaving the None placeholder in place)
        if bias:
            bias_shape = [shape[-1]]
            op.set_input_tensor(
                create_const_tensor(
                    "bias",
                    bias_shape,
                    inp.dtype,
                    np.ones(bias_shape) * bias,
                    value_dtype=np.int32,
                    quantization=None,
                ),
                2,
            )

    return op

Ayaan Masood

2022-06-29 18:16:04 +0100

[diff] [blame]

1660

def optimise_quantize(op: Operation, arch, nng):
    """Fold a Quantize operation with a constant input into a constant at compile time.

    Handles two cases for an NPU-scheduled Quantize whose input is produced
    by a Const operation and has values:

    * int8 -> int8 or int16 -> int16 requantization, computed with a
      quantized multiplier/shift derived from the double precision scale
      ratio.
    * float input, quantized as ``round(value / scale) + zero_point`` and
      clamped to the output range.

    In both cases the result is stored in the OFM values, the operation is
    removed from the IFM consumer list, its inputs are cleared and its type
    is changed to Const. Any other operation, a non-constant input or an
    unsupported data type combination leaves ``op`` untouched.

    Args:
        op: Operation to inspect; modified in place when it is folded.
        arch: Not used by this rewrite.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    if op.type == Op.Quantize and op.run_on_npu:

        ifm, ofm = op.get_ifm_ofm()
        input_values = ifm.values

        # Guard clause - input not const or no values to quantize
        if ifm.ops[0].type != Op.Const or input_values is None:
            return op

        # Singular val in numpy array, convert to indexable array
        if input_values.ndim == 0:
            input_values = np.array([input_values])

        # requantized int8 to int8 or int16 to int16
        if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:

            # scale needs to use double precision to match TFLite reference kernel
            effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
            effective_multiplier, effective_shift = quantise_scale(effective_scale)

            requantized_vals = []
            for val in input_values.flatten():
                input_val = val - ifm.quantization.zero_point

                ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
                ofm_val += ofm.quantization.zero_point

                clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
                requantized_vals.append(clamped_ofm_value)

            ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
            ofm.values.shape = input_values.shape

        # Case: Float input - quantize to int
        elif ifm.dtype.type == BaseType.Float:

            # Derive quantized values. The scaled value must be rounded to nearest
            # (halfway cases away from zero, like std::round in the TFLite reference
            # kernel) before the zero point is added; a plain float -> int conversion
            # would truncate toward zero and e.g. turn -0.9 into 0 instead of -1.
            scaled_vals = input_values / ofm.quantization.scale_f32
            rounded_vals = np.sign(scaled_vals) * np.floor(np.abs(scaled_vals) + 0.5)
            clamped_quantized_vals = np.clip(
                rounded_vals + ofm.quantization.zero_point,
                ofm.quantization.quant_min,
                ofm.quantization.quant_max,
            )

            # Pass the statically calculated quant val to output tensor
            ofm.values = clamped_quantized_vals.astype(ofm.dtype.as_numpy_type())

        # Unsupported data type
        else:
            return op

        # Make quantize op const and disconnect from parent node

        # Remove reference of the current quant op from the parent tensor's consumer list
        ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]

        # Clear any references to parent node
        op.inputs = []

        # Convert this quantize op to const
        op.type = Op.Const

    return op

Ayaan Masood

2022-06-29 11:30:57 +0100

[diff] [blame]

1727

def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
    """Replace a SHAPE operator by a constant, as its output is known at compile time.

    Only NPU-scheduled Shape operations are considered, and only when the
    rank of the IFM equals the first dimension of the OFM shape. The
    operation is detached from its input tensor, turned into a Const and
    the IFM shape is stored as the OFM values.

    Args:
        op: Operation to inspect; modified in place when it is converted.
        arch: Not used by this rewrite.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    is_npu_shape_op = op.type == Op.Shape and op.run_on_npu
    if not is_npu_shape_op:
        return op

    ifm, ofm = op.get_ifm_ofm()

    # The OFM must have room for exactly one entry per IFM dimension
    if len(ifm.shape) != ofm.shape[0]:
        return op

    # Drop this op from the consumers of its parent tensor
    own_index = op.op_index
    ifm.consumer_list = [user for user in ifm.consumer_list if user.op_index != own_index]

    # Forget the parent tensor and turn the op into a constant
    op.inputs = []
    op.type = Op.Const

    # The constant's values are the dimensions of the former input tensor
    ofm.values = np.array(ifm.shape)

    return op

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1754

def supported_operator_check(op, arch, nng):
    """Record on the operation whether the architecture's TFLite checker supports it.

    Args:
        op: Operation to check; its ``run_on_npu`` attribute is overwritten.
        arch: Object providing ``tflite_supported_operators.is_operator_supported``.
        nng: Not used by this rewrite.

    Returns:
        The same ``op`` object that was passed in.
    """
    checker = arch.tflite_supported_operators
    op.run_on_npu = checker.is_operator_supported(op)
    return op

def tflite_optimise_graph(nng, arch):

Fredrik Svedberg

1156317

2022-07-06 14:54:12 +0200

[diff] [blame]

1760

# Compile time static optimisations

Ayaan Masood

2022-06-29 18:16:04 +0100

[diff] [blame]

1761

optimisation_list = [optimise_quantize, convert_shape_op_to_constant_tensor]

1762

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1763

for idx, sg in enumerate(nng.subgraphs):

1764

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

nng,

sg,

arch,

[],

Ayaan Masood

4965fae

2022-06-29 11:30:57 +0100

[diff] [blame]

1769

optimisation_list,

1770

rewrite_unsupported=False,

1771

)

1772

Fredrik Svedberg

2022-07-06 13:42:24 +0200

[diff] [blame]

1773

# Pre-processing step

1774

pre_process_list = [

1775

supported_operator_check,

1776

set_ifm_ofm_op_shapes,

1777

]

1778

Ayaan Masood

4965fae

2022-06-29 11:30:57 +0100

[diff] [blame]

1779

for idx, sg in enumerate(nng.subgraphs):

1780

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

nng,

sg,

arch,

[],

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

1785

pre_process_list,

1786

rewrite_unsupported=False,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

)

# Handle Concat Ops

for idx, sg in enumerate(nng.subgraphs):

1791

rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])

1792

sg.refresh_after_modification()

1793

1794

# Handle Split Ops

1795

for idx, sg in enumerate(nng.subgraphs):

1796

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

nng,

sg,

arch,

[],

[rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],

1802

rewrite_unsupported=False,

1803

)

1804

1805

for idx, sg in enumerate(nng.subgraphs):

1806

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

nng,

sg,

arch,

[rewrite_split_ops],

[],

rewrite_unsupported=False,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1813

)

1814

1815

# Handle sg input output

1816

for idx, sg in enumerate(nng.subgraphs):

1817

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

nng,

sg,

arch,

[],

[fix_sg_input_output],

1823

rewrite_unsupported=False,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1824

)

1825

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

1826

# Removal of memory only operators

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1827

for sg in nng.subgraphs:

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

1828

rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_only_ops])

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1829

sg.refresh_after_modification()

1830

1831

# Rewrite of operators

1832

op_rewrite_list = [

1833

set_tensor_equivalence,

1834

convert_mean_to_depthwise_conv_or_avgpool,

1835

convert_depthwise_to_conv,

1836

convert_conv_to_fc,

1837

convert_softmax,

Fredrik Svedberg

8ddd489

2022-08-19 16:06:04 +0200

[diff] [blame]

1838

convert_prelu,

Fredrik Svedberg

2022-09-16 09:39:26 +0200

[diff] [blame^]

1839

convert_mul_max_to_abs_or_lrelu,

1840

convert_lrelu,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1841

optimise_strided_conv,

1842

convert_hardswish_to_lut,

1843

rewrite_fully_connected_input,

1844

convert_batched_fc_shape,

1845

fixup_conv2d_backprop,

1846

fixup_relus_with_differing_ifm_ofm_scaling,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1847

reorder_depthwise_weights,

Tim Hall

2022-07-21 11:46:03 +0100

[diff] [blame]

1848

fixup_resize,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1849

fixup_bias_tensors,

Fredrik Svedberg

cc8569f

2021-11-01 14:25:29 +0100

[diff] [blame]

1850

fixup_asymmetric_weights,

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

1851

convert_tanh_sigmoid_to_lut,

1852

replace_pad_by_hw_pad,

1853

]

1854

1855

for idx, sg in enumerate(nng.subgraphs):

1856

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

nng,

sg,

arch,

[],

op_rewrite_list,

rewrite_unsupported=False,

Patrik Gustavsson