# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .api import NpuRoundingMode
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import fix_sg_input_output
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .numeric_util import clamp_sigmoid
from .numeric_util import round_away_zero
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype

passthrough_nodes = (Op.Identity,)


def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Creates an average pool for the given concat op/input feature map"""
    ofm = concat_op.ofm
    avgpool_op = create_avgpool_nop(name)
    avgpool_op.inputs = [ifm]
    avgpool_op.outputs = [ofm]

    avgpool_op.write_offset = write_offset
    avgpool_op.write_shape = ifm_shape
    ofm.ops.append(avgpool_op)
    avgpool_op.ifm_shapes.append(ifm_shape)
    avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
    avgpool_op.memory_function = Op.ConcatSliceWrite
    DebugDatabase.add_optimised(concat_op, avgpool_op)
    return avgpool_op


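# Replace a tensor produced by a passthrough node (e.g. Identity) with the producing op's input tensor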
def remove_passthrough_tensor(tens, arch, nng):
    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
        assert len(tens.ops[0].inputs) == 1
        tens = tens.ops[0].inputs[0]
    return tens


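# Rewrite concatenation ops (Concat/Pack) into average pool ops that each write one input feature map
# to the correct offset within the shared output tensor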
def rewrite_concat_ops(op, arch):
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]

        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert ofm.shape[axis] == offset

    return op


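# Rewrite split/slice ops by replacing the producer of the output tensor with a SplitSliceRead op
# that reads the appropriate offset and shape from the input tensor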
def rewrite_split_ops(tens, arch, nng):

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0
        if None in (offset_end, offset_start):
            read_shape = None
        else:
            # the read shape is relative to each start offset
            read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    read_shape = split_op.ofm_shapes[idx]
                    break

                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.read_shapes[0] = read_shape
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens


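# Move a SplitSliceRead op into its consumer when possible, otherwise materialise it as an average pool
# that performs the read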
def remove_SplitSliceRead(op, arch):

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
        if (
            len(op.ofm.consumer_list) == 1
            and op.ofm.consumer_list[0] is not None
            and op.ofm.consumer_list[0].run_on_npu
            and op.ofm.consumer_list[0].type not in memory_only_ops
            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
        ):
            # SplitSliceRead can be performed by tensor consumer
            cons_op = op.ofm.consumer_list[0]
            move_splitsliceread_to_consumer(op, cons_op)
        else:
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]
            avgpool_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)


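# Calculate explicit padding (top, left, bottom, right) and skirt for the given padding type, kernel and input shape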
def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt


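# Calculate padding and skirt for ops where the IFM is upscaled, e.g. transpose convolution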
def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt


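# Convert Conv2DBackpropInput to Conv2DBackpropInputSwitchedBias with unit strides and transpose IFM resampling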
def fixup_conv2d_backprop(op, arch, nng):
    if op.type == Op.Conv2DBackpropInput:
        # flip the inputs
        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
        op.type = Op.Conv2DBackpropInputSwitchedBias
        op.ifm_resampling_mode = resampling_mode.TRANSPOSE

        # Update strides
        op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
        DebugDatabase.add_optimised(op, op)

    return op


# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    op.type = Op.Add  # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.name = op.name + "_add"
    # Create an input tensor filled with zeros
    shape = op.ofm_shapes[0].as_list()
    tens = Tensor(shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
    tens.values = np.zeros(shape, tens.dtype.as_numpy_type())
    tens.quantization = QuantizationParameters(0.0, 255.0)
    tens.quantization.scale_f32 = 1.0
    tens.quantization.zero_point = 0
    tens.consumer_list = [op]
    tens_op = op.inputs[1].ops[0]
    tens_op.set_output_tensor(tens)
    # Set the add inputs
    op.inputs[1] = op.inputs[0]
    op.inputs[0] = tens
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op


# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
# to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]
    dw_op_attrs = {
        "padding": Padding.VALID,
        "stride_h": 1,
        "stride_w": 1,
        "strides": (1, 1, 1, 1),
        "depth_multiplier": 1,
        "channel_multiplier": 1,
        "dilation_h_factor": 1,
        "dilation_w_factor": 1,
        "dilation": (1, 1, 1, 1),
    }

    # change the resize op to depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(dw_op_attrs)
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0
    ofm_dtype = ofm.dtype
    if ofm_dtype == DataType.uint8:
        weight_value_dtype = np.uint8
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        if ofm_dtype == DataType.int8:
            weight_value_dtype = np.int8
        else:
            assert ofm_dtype == DataType.int16
            weight_value_dtype = np.int16

        weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
        weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    weight_values = [0] * (upscale_factor * upscale_factor)
    centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
    weight_values[centre_coeff] = 1

    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(
        create_const_tensor(
            "weights",
            weight_shape,
            ofm.dtype,
            np.array(weight_values).reshape(weight_shape),
            value_dtype=weight_value_dtype,
            quantization=weight_quant,
        ),
        1,  # inputs tensor weight index
    )

    # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None, DataType.int32)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op


# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize op's upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    pre_op = op
    outputs = op.outputs
    dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    n = int(np.log2(upscale_factor))

    # Perform x2 upscaling n-1 times
    scaled_op = pre_op
    for count in range(n - 1):
        if count > 0:
            scaled_op = op.clone(f"_{count}")
            scaled_op.inputs[0] = pre_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_shape = upscaled_shape * 2
        shape = op.ofm_shapes[0].as_list()
        shape[1:3] = upscaled_shape
        out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
        out_tens.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    # Last x2 upscaling
    if n > 1:
        scaled_op = op.clone(f"_{n-1}")
        scaled_op.inputs[0] = pre_op.outputs[0]

    if scaled_op.original_type == Op.ResizeBilinear:
        if scaled_op.attrs["align_corners"]:
            # no padding
            scaled_op.attrs["padding"] = Padding.VALID
        else:
            # padding to the right and bottom (limits average pool to 8x8 kernel)
            scaled_op.attrs["padding"] = Padding.EXPLICIT
            scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

        # kernel size dependent on the upscaling factor
        scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
    else:  # Op.ResizeNearestNeighbor
        if scaled_op.attrs["align_corners"]:
            # use depthwise conv to select the correct value
            scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
        else:
            # Keep 1x1 kernel and average pool, this applies both when
            # half-pixel-centers is True and False. Calculations are the
            # same in the reference.
            pass

    scaled_op.outputs = outputs
    scaled_op.outputs[0].ops = [scaled_op]
    scaled_op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, scaled_op)

    return op


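# Convert a x2 upscaling ResizeBilinear to an initial average pool followed by four 2x2 depthwise convolutions
# whose outputs are interleaved into the OFM using tile base offsets and doubled OFM strides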
def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
    def _compute_interpolation_values(index, input_size, output_size):
        scale = input_size / output_size
        scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
        lower_bound = max(np.floor(scaled_value), 0)

        return scaled_value, lower_bound

    def _compute_kernels(input_height, input_width, output_height, output_width):
        kernels = []
        for y in (1, 2):
            for x in (1, 2):
                sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
                sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)

                # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
                # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
                # top-to-bottom - same as the depthwise convolution strides across each tile
                kernel = np.zeros((2, 2))
                kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
                kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
                kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
                kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
                kernel *= 16
                kernels.append(kernel)

        return kernels

    def _build_convolutions(op, kernels):
        dw_op_attrs = {
            "padding": Padding.TILE,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
        ifm = op.ifm
        ofm = op.ofm
        ofm.ops = []
        elem_size = 2 if ofm.dtype == DataType.int16 else 1

        n, h, w, c = ifm.shape
        _, _, ow, _ = ofm.shape

        intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
        intermediate_tens.quantization = op.outputs[0].quantization.clone()
        avgpool_op = op
        avgpool_op.name = "rb_init_avgpool"
        avgpool_op.type = Op.AvgPool
        avgpool_op.attrs["padding"] = Padding.VALID
        avgpool_op.attrs["stride_w"] = 1
        avgpool_op.attrs["stride_h"] = 1
        avgpool_op.attrs["filter_width"] = 1
        avgpool_op.attrs["filter_height"] = 1
        avgpool_op.attrs["strides"] = [1, 1, 1, 1]
        avgpool_op.attrs["ksize"] = [1, 1, 1, 1]

        avgpool_op.add_input_tensor(ifm)
        avgpool_op.set_output_tensor(intermediate_tens)
        avgpool_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, op)

        dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
        dw_conv._original_type = Op.ResizeBilinear
        dw_conv.write_shape = Shape4D(n, h, w, c)
        dw_conv.write_offset = Shape4D(0, 0, 0, 0)

        # Set the output rounding mode. Resize bilinear requires rounding away from zero. Therefore, we need to
        # adjust the accumulated value by a "small" amount before applying natural rounding. The "small" amount
        # should be big enough to cause a x.5 to be rounded correctly but small enough not to cause smaller
        # values to be incorrectly rounded
        ofm.quantization.next_after = True
        dw_conv.rounding_mode = NpuRoundingMode.NATURAL

        # Double height and width stride to write the output of each of the four depthwise convolutions below
        # interleaved with each other when combined with OFM tile base offsets.
        dw_conv.ofm_stride_multiplier = [1, 2, 2]  # C/H/W

        # Choose tile padding direction - pad by 1 with edge values in two directions.
        # For example, TL (top left) will pad top and left in H/W-plane in all channels.
        directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]]  # TL, TR, BL, BR
        for i in (0, 1):
            for j in (0, 1):
                index = i * 2 + j
                dw_conv.name = f"depthwise_conv_{index}"
                dw_op_attrs["explicit_padding"] = directions[index]
                dw_conv.attrs.update(dw_op_attrs)

                # This will offset the start of the write by modifying the Tile 0 base address
                dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size

                ofm.ops.append(dw_conv)
                dw_conv.outputs = [ofm]

                kernel = kernels[index]
                shape = [2, 2, 1, c]
                kernel = np.dstack([kernel] * c)

                quant = QuantizationParameters()
                quant.zero_point = 0
                quant.scale_f32 = 1.0 / 16

                dw_conv.inputs = []
                dw_conv.add_input_tensor(intermediate_tens)
                dw_conv.add_input_tensor(
                    create_const_tensor(
                        "weights",
                        shape,
                        intermediate_tens.dtype,
                        np.array(kernel).reshape(shape),
                        value_dtype=np.int8,
                        quantization=quant,
                    ),
                )

                # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
                # need to append the bias tensor as resize ops only have 2 inputs
                assert len(dw_conv.inputs) == 2
                dw_conv.inputs.append(None)
                fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)

                dw_conv.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, dw_conv)

                dw_conv = dw_conv.clone(f"_{index}")
        return op

    _, input_height, input_width, _ = op.ifm.shape
    _, output_height, output_width, _ = op.ofm.shape

    kernels = _compute_kernels(input_height, input_width, output_height, output_width)
    op = _build_convolutions(op, kernels)

    return op


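# Lower a resize op: bypass it when it is a NOP, otherwise convert it to an elementwise add,
# depthwise convolutions or upscale + average pool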
def fixup_resize(op, arch, nng):
    if op.type.is_resize_op() and op.run_on_npu:
        if op.ifm_shapes[0] == op.ofm_shapes[0]:
            # Bypass the resize op which is essentially a NOP
            op.inputs = op.inputs[:1]
            op.type = Op.Identity
        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
            convert_resize_1x1_to_add(op)
        elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
            convert_resizebilinear_to_depthwise_convolutions(op)
        else:
            convert_resize_to_upscale_and_average_pool(op)

    return op


def convert_nop_split_to_identity(op, arch, nng):
    if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it shouldn't, remove_passthrough_tensor will fail appropriately
        op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
        op.type = Op.Identity
    return op


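# Reshape the FullyConnected IFM to 2D; if the IFM is batching, make sure the OFM shape is batching too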
def rewrite_fully_connected_input(op: Operation, arch, nng):

    if op.type == Op.FullyConnected:
        new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
        assert new_shape is not None, "Tensor can not be reshaped to 2D"
        op.ifm_shapes[0] = new_shape

        if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
            # If IFM is batching then also make sure OFM is batching
            h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
            op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])

    return op


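# Spread the batch dimension of a batched FullyConnected over the height and width dimensions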
def convert_batched_fc_shape(op, arch, nng):
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op


def unfuse_activation_function(op):
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, act_op)


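# Adjust StridedSlice output shapes for new_axis_mask/shrink_axis_mask and record the resulting 4D split axis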
def rewrite_stridedslice_output(op, arch, nng):
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op


def rewrite_unpack_output(op, arch, nng):
    tens = op.outputs[0]
    if op.run_on_npu and op.type == Op.Unpack:
        # Unpack is also referred to as Unstack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis
        op.type = Op.UnpackReshaped
        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

        axis_4D = axis + (4 - len(desired_output_shape))
        op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

        for idx, out_tens in enumerate(op.outputs):
            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op


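# Calculate and attach explicit padding and skirt attributes to ops that use padding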
def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"],
                    op.kernel,
                    input_shape,
                    op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


def reorder_depthwise_weights(op, arch, nng):
    if op.type.is_depthwise_conv2d_op():
        weight_tensor = op.inputs[1]
        weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
        weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        weight_tensor.weight_transpose_depthwise = True

    return op


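# Optimise a stride-2 Conv2D with a shallow IFM by reshaping the IFM to half width and double depth and
# padding/reshaping the weights to match, so that the convolution can be run with stride 1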
def optimise_strided_conv(op, arch, nng):
    if op.type != Op.Conv2DBias or op.op_index != 0:
        return op
    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]

    if (
        stride_x == 2
        and ifm_shape.depth <= 4
        and ifm_shape.width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    ):
        k_w, _ = op.get_kernel_size()
        curr_padding_x = needed_total_padding(ifm_shape.width, 2, k_w)
        optimised_padding_x = needed_total_padding(ifm_shape.width // 2, 1, (k_w + 1) // 2)
        if curr_padding_x != optimised_padding_x:
            # Horizontal padding would become different after optimisation; this would not work
            return op
        # IFM
        op.ifm_shapes[0] = Shape4D([ifm_shape.batch, ifm_shape.height, ifm_shape.width // 2, ifm_shape.depth * 2])

        # Weights
        weight_shape = weight_tensor.shape
        if weight_shape[1] % 2 != 0:
            weight_shape[1] = weight_shape[1] + 1
            padded_array = np.zeros(weight_shape)
            for i in range(weight_shape[0]):
                padded_array[i] = np.vstack(
                    [
                        weight_tensor.values[i],
                        np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
                    ]
                )
            weight_tensor.values = padded_array
        weight_shape[1] //= 2
        weight_shape[2] *= 2
        weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides
        stride_x = 1
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

    return op


def convert_conv_to_fc(op, arch, nng):
    # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
    if op.type == Op.Conv2DBias:
        h = op.ifm_shapes[0].height
        w = op.ifm_shapes[0].width
        kh, kw, _, _ = op.inputs[1].shape
        if h == 1 and w == 1 and kh == 1 and kw == 1:
            # Overwrite this op as a Fully Connected Op
            op.name += "_fc"
            op.type = Op.FullyConnected
            op.attrs = {
                "weights_format": 0,
            }
            # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
            weight_tensor = op.inputs[1]
            weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
    return op


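# Give a Relu with differing IFM and OFM scaling its own primary op (average pool) with explicit rescaling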
def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own primary op to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Add explicit rescaling
            rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
            multiplier, shift = scaling.quantise_scale(rescale)
            relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            op = relu_fused_op
    return op


def convert_softmax(op, arch, nng):
    if op.type == Op.Softmax and op.run_on_npu:
        softmax = SoftMax(op)
        op = softmax.get_graph()
    return op


def convert_prelu(op, arch, nng):
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If const alpha check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu
            alpha_min = (alpha.values.min().astype(np.int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(np.int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
                # Multiply with alpha tensor
                mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
                mul_alpha.add_input_tensor(ifm)
                mul_alpha.add_input_tensor(alpha)
                fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
                mul_alpha.set_output_tensor(fm_alpha)
                mul_alpha.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, mul_alpha)
                if check_quantized_tens_scaling_equal(ifm, ofm):
                    # No scaling is needed
                    fm_id = ifm
                else:
                    # Add multiplication with identity
                    mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
                    mul_identity.add_input_tensor(ifm)
                    # Create const tensor containing identity as scalar
                    quantization = ifm.quantization.clone()
                    quantization.scale_f32 = np.float32(1)
                    quantization.zero_point = 0
                    one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
                    mul_identity.add_input_tensor(one)
                    # Make sure that fm_id is allocated to a different address than fm_alpha
                    fm_id = ofm.clone(op.name + "_id", set_unique=True)
                    mul_identity.set_output_tensor(fm_id)
                    mul_identity.set_ifm_ofm_shapes()
                    DebugDatabase.add_optimised(op, mul_identity)

                # Combine scaled and alpha multiplied values
                max_op = Operation(Op.Maximum, op.name + "_max")
                max_op.add_input_tensor(fm_alpha)
                max_op.add_input_tensor(fm_id)
                max_op.set_output_tensor(ofm)
                max_op.set_ifm_ofm_shapes()

                DebugDatabase.add_optimised(op, max_op)
                ifm.consumer_list.remove(op)
                return max_op

        # Catch all PReLU conversion for the cases that could not be optimised above
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
        min_op.set_output_tensor(fm_negative)
        min_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, min_op)

        # and multiply with alpha tensor
        mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
        mul_alpha.add_input_tensor(fm_negative)
        mul_alpha.add_input_tensor(alpha)
        fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
        mul_alpha.set_output_tensor(fm_alpha)
        mul_alpha.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_alpha)

        # Select (and scale) values > 0
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_scaled)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)

        # Add scaled and alpha multiplied values (without scaling)
        add_op = Operation(Op.Add, op.name + "_add")
        add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
        add_op.add_input_tensor(fm_alpha)
        add_op.add_input_tensor(fm_scaled)
        add_op.set_output_tensor(ofm)
        add_op.set_ifm_ofm_shapes()

        DebugDatabase.add_optimised(op, add_op)
        ifm.consumer_list.remove(op)
        op = add_op

    return op


def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X          For X = -1 or X > 0
    |   \   /           This subgraph can be replaced with either
    |    Mul             an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max
    """

    if op.type == Op.Maximum:
        # finds the Mul input(s) to the Max
        muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
        if len(muls) == 1:
            mul = muls[0].ops[0]
        elif len(muls) == 2:
            # In the case both inputs are Muls, find the one with the same input as the Max
            mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
            if len(mul_ifms):
                mul = mul_ifms[0].ops[0]
            else:
                # Not using same input
                return op
        else:
            # No Mul inputs
            return op

        # make sure the Mul doesn't have any other consumers
        mul_ofm = mul.outputs[0]
        if len(mul_ofm.consumers()) != 1:
            return op
        # make sure the Mul doesn't have a fused activation function
        if mul.activation:
            return op
        ifm, ofm = op.get_ifm_ofm()
        if ifm is None or ofm is None:
            return op

        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
            return op
        if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
            return op

        # finds the branched input that goes to both the Max and the Mul
        shared = set(op.inputs) & set(mul.inputs)
        if len(shared) == 1:
            shared_in = shared.pop()
            # find the constant scalar input to the Mul
            const_tens = (set(mul.inputs) - {shared_in}).pop()
            # check that it is a scalar
            if const_tens.shape != []:
                return op
            const = const_tens.ops[0]
            # check that it is a constant
            if const.type != Op.Const:
                return op
            # Remove the Mul from the shared input's consumers
            shared_in.consumer_list.remove(mul)
        else:
            return op

        val = const.outputs[0].values
        if val >= 0:
            new_op = Op.LeakyRelu
            op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scale", to be used as input
            # to the LUT construction
            alpha_scalar = const_tens.values - const_tens.quantization.zero_point
            mul_ifm_scale = np.double(ifm.quantization.scale_f32)
            mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
            mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
            alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
            op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
        elif val == -1:
            new_op = Op.Abs
        else:
            return op

        op.type = new_op
        op.name = op.name.replace("Maximum", new_op.name)
        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
        op.inputs = [shared_in]
        op.set_ifm_ofm_shapes()

        # Record optimisation in debug database
        DebugDatabase.add_optimised(op, op)

    return op


def convert_hardswish_to_lut(op, arch, nng):
    if op.type == Op.HardSwish:
        ifm, ofm = op.get_ifm_ofm()
        # Generate the LUT
        ifm_scale = np.double(ifm.quantization.scale_f32)
        ofm_scale = np.double(ofm.quantization.scale_f32)
        zp_in = ifm.quantization.zero_point
        zp_out = ofm.quantization.zero_point
        ifm_scale_hires = (1 / 128) * ifm_scale
        relu_multiplier = np.double(3 / 32768)
        out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
        relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
        # Use 16bit scale
        out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
        relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)

        values = []
        ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
        quantized_min = min(ix)
        quantized_max = max(ix)
        for x in ix:
            input_value = x - zp_in
            input_value_hires = input_value * 128
            # Compute the input value on essentially the output scale, not shifted yet
            input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
            # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
            relu_value = np.int16(input_value_hires)
            if relu_shift < 31:
                relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)

            relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)

            if relu_shift < 31:
                relu_value = fp_math.shift_left16(relu_value, 1)

            if relu_shift > 31:
                relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)

            # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
            # Now convert that to a 16bit fixedpoint value in [0, 1]
            relu_value = (relu_value + (1 << 15)) >> 1
            lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
            shift = 31 - out_shift
            shift = -shift if shift < 0 else 0
            # Finally apply the output shift
            lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
            lut_result = min(quantized_max, max(quantized_min, lut_result))
            values.append(lut_result)
        return convert_to_lut(op, values, "hardswish")
    return op


1171def convert_lrelu_to_mul_max(op, arch):
1172 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1173 # (the opposite of convert_mul_max_to_abs_or_lrelu)
1174 ifm, ofm = op.get_ifm_ofm()
1175 if ifm is None or ofm is None:
1176 return op
1177
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001178 alpha = np.float32(op.attrs["alpha"])
1179 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001180 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001181 if use_mul_max:
1182 mul_ifm = ifm
1183 new_op = Op.Maximum
1184 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001185 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001186 no_scale_quant = ifm.quantization.clone()
1187 no_scale_quant.scale_f32 = None
1188 no_scale_quant.zero_point = 0
1189 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1190
1191 # Select values < 0
1192 min_op = Operation(Op.Minimum, op.name + "_min")
1193 min_op.add_input_tensor(ifm)
1194 min_op.add_input_tensor(zero)
1195 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001196 if alpha < 0 and not is_converted_prelu:
1197 # For negative alpha that is not from a converted PReLU we need to use
1198 # int32 Mul below to perform the (negative) alpha scaling
1199 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001200 min_op.set_output_tensor(mul_ifm)
1201 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001202 new_op = Op.Add
1203 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001204 DebugDatabase.add_optimised(op, min_op)
1205
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001206 # Add multiplication with alpha
1207 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001208 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001209 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001210 quantization = ifm.quantization.clone()
1211 quantization.min = 0
1212 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1213 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001214 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001215 if is_converted_prelu:
1216 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001217 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001218 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001219 elif alpha == 0 or np.isinf(1 / alpha):
1220 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001221 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001222 scalar = 0
1223 else:
1224 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001225 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001226 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001227 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1228 else:
1229 scalar = 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001230 alpha_tens = create_const_tensor(
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001231 op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], alpha_dtype.as_numpy_type(), quantization=quantization
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001232 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001233 mul_alpha.add_input_tensor(alpha_tens)
1234 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1235 mul_alpha.set_output_tensor(fm_alpha)
1236 mul_alpha.set_ifm_ofm_shapes()
1237 DebugDatabase.add_optimised(op, mul_alpha)
1238
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001239 if not use_mul_max:
1240 relu_op = Operation(Op.Relu, op.name + "_relu")
1241 relu_op.add_input_tensor(ifm)
1242 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1243 relu_op.set_output_tensor(fm_id)
1244 relu_op.set_ifm_ofm_shapes()
1245 DebugDatabase.add_optimised(op, relu_op)
1246 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001247 # No identity multiplication is needed
1248 fm_id = ifm
1249 else:
1250 # Add multiplication with identity
1251 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1252 mul_identity.add_input_tensor(ifm)
1253 # Create const tensor containing identity as scalar
1254 quantization = ifm.quantization.clone()
1255 quantization.min = 0
1256 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001257 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001258 quantization.zero_point = 0
1259 identity_tens = create_const_tensor(
1260 op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=quantization
1261 )
1262 mul_identity.add_input_tensor(identity_tens)
1263 # Make sure that fm_id is allocated to a different address than fm_alpha
1264 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1265 mul_identity.set_output_tensor(fm_id)
1266 mul_identity.set_ifm_ofm_shapes()
1267 DebugDatabase.add_optimised(op, mul_identity)
1268
1269 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001270 op.type = new_op
1271 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001272 op.inputs = []
1273 ifm.consumer_list.remove(op)
1274 op.add_input_tensor(fm_alpha)
1275 op.add_input_tensor(fm_id)
1276 op.set_ifm_ofm_shapes()
1277
1278 DebugDatabase.add_optimised(op, op)
1279 return op
1280
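# Illustrative sketch (not used by the pass above): the Mul + Maximum rewrite relies on the
# identity LeakyRelu(x) = max(alpha * x, x), which holds for 0 < alpha < 1. The alpha value
# below is a made-up example, not taken from any real graph.
def _example_lrelu_as_mul_max(x, alpha=0.1):
    # Float reference of the rewrite: one multiplication by alpha, one (identity) copy of x,
    # then an elementwise maximum; the NPU performs the same steps in the quantized domain.
    return np.maximum(alpha * x, x)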
1281
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001282def convert_to_lut8(op, fn, fn_name):
1283 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1284 # fn is a function(real) -> real
1285 ifm, ofm = op.get_ifm_ofm()
1286 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1287 return op
1288 # Generate the LUT
1289 ifm_scale = np.double(ifm.quantization.scale_f32)
1290 ofm_scale = np.double(ofm.quantization.scale_f32)
1291 zp_in = ifm.quantization.zero_point
1292 zp_out = ofm.quantization.zero_point
1293 values = []
1294 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1295 quantized_min = min(ix)
1296 quantized_max = max(ix)
1297 for x in ix:
1298 x_real = ifm_scale * (x - zp_in)
1299 y_real = fn(x_real)
1300 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1301 lut_result = min(quantized_max, max(quantized_min, lut_result))
1302 values.append(lut_result)
1303 return convert_to_lut(op, values, fn_name)
1304
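# Minimal sketch of the table generation in convert_to_lut8, outside the Operation/Tensor
# machinery. The scales and zero points are made-up example values for an int8 tensor;
# fn is any real -> real function, e.g. _example_generate_int8_lut(clamp_sigmoid).
def _example_generate_int8_lut(fn, ifm_scale=0.05, ofm_scale=1.0 / 256, zp_in=0, zp_out=-128):
    values = []
    for x in range(-128, 128):
        y_real = fn(ifm_scale * (x - zp_in))  # dequantize the input index, apply the function
        lut_val = round_away_zero(zp_out + y_real / ofm_scale)  # requantize to the output scale
        values.append(min(127, max(-128, lut_val)))  # clamp to the int8 range
    return values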
1305
1306def convert_lrelu_to_lut(op, arch):
1307 ifm, ofm = op.get_ifm_ofm()
1308 # Generate the LUT
1309 alpha = op.attrs["alpha"]
1310 ifm_scale = np.double(ifm.quantization.scale_f32)
1311 ofm_scale = np.double(ofm.quantization.scale_f32)
1312 zp_in = ifm.quantization.zero_point
1313 zp_out = ofm.quantization.zero_point
1314 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1315 alpha_scalar = 1
1316 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1317 if "alpha_scaling" in op.attrs:
1318 # The LeakyRelu is the result of convert_mul_max_to_abs_or_lrelu
1319 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1320 values = []
1321 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1322 quantized_min = min(ix)
1323 quantized_max = max(ix)
1324 for x in ix:
1325 if x < zp_in:
1326 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1327 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1328 )
1329 else:
1330 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1331 lut_result = min(quantized_max, max(quantized_min, lut_result))
1332 values.append(lut_result)
1333 return convert_to_lut(op, values, "lrelu")
1334
1335
1336def convert_lrelu(op, arch, nng):
1337 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
1338 if op.type != Op.LeakyRelu:
1339 return op
1340 ifm, ofm = op.get_ifm_ofm()
1341 if ifm is None or ofm is None:
1342 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001343 alpha = op.attrs["alpha"]
1344 if alpha == 0:
1345 # When alpha is 0 the operation can be converted to a ReLU
1346 op.type = Op.Relu
1347 op.name = op.name.replace("LeakyRelu", op.type.name)
1348 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001349 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1350 # use LUT for int8/uint8
1351 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001352 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001353 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001354 return op
1355 return convert_lrelu_to_mul_max(op, arch)
1356
1357
1358def convert_tanh_sigmoid_to_lut(op, arch, nng):
1359 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
1360 if op.type == Op.Sigmoid:
1361 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1362 elif op.type == Op.Tanh:
1363 return convert_to_lut8(op, math.tanh, "tanh")
1364 return op
1365
1366
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +02001367def remove_memory_only_ops(op, arch):
1368 if op.run_on_npu and op.type in memory_only_ops:
1369 bypass_memory_only_ops(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001370
1371
1372def fuse_activation_function_with_prev(op, arch, nng):
1373 # if op is a no-op: attempts to move the activation function to the preceding op
1374 if not op.attrs.get("is_nop", False) or op.activation is None:
1375 return op
1376 ifm, ofm = op.get_ifm_ofm()
1377 if ifm is None or ofm is None:
1378 return op
1379 # finds the input(s) to the operation
1380 prev_op = ifm.ops[0]
1381 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1382 fuse = (
1383 prev_op.run_on_npu
1384 and prev_op.type.npu_block_type != NpuBlockType.Default
1385 and len(ifm.ops) == 1
1386 and len(prev_op.outputs[0].consumers()) == 1
1387 and prev_op.activation is None
1388 )
1389 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1390 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1391 # LUT currently only works correctly for elementwise ops
1392 fuse = False
1393 if not fuse:
1394 return op
1395 # Move the fused activation function + corresponding info to prev_op
1396 prev_op.activation = op.activation
1397 prev_op.forced_output_quantization = op.forced_output_quantization
1398 if op.activation_lut is not None:
1399 prev_op.set_activation_lut(op.activation_lut)
1400 # Bypass op
1401 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001402 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001403 return op
1404
1405
1406def _leading_pad_ok(leading_pad, stride, kernel_size):
1407 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1408 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
1409 max_size = kernel_size // 2
1410 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1411
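# Worked example of the rule above, with made-up values: for kernel_size=7, stride=2 the kernel
# reach is 7 // 2 = 3, which is greater than the stride, so a leading pad must be 3 (the maximum)
# or a multiple of the stride; a pad of 1 would start the kernel on the wrong IFM row/column.
def _example_leading_pads(stride=2, kernel_size=7):
    # Returns the acceptable leading pads, [0, 2, 3] for the defaults above
    return [pad for pad in range(kernel_size // 2 + 1) if _leading_pad_ok(pad, stride, kernel_size)]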
1412
1413def replace_pad_by_hw_pad(op: Operation, arch, nng):
1414 """
1415 Tries to completely remove a PAD operator by using hardware padding.
1416 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1417 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1418 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1419 if both operations can be run on the NPU.
1420 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1421 """
1422 if (
1423 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001424 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001425 and op.run_on_npu
1426 and op.attrs["padding"] == Padding.VALID
1427 ):
1428 pad_op = op.ifm.ops[0]
1429 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1430 return op
1431 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1432 return op
1433 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1434 k = op.kernel
1435 k_w, k_h = k.dilated_wh()
1436
1437 # Check if the PAD operator can be replaced by hardware padding
1438 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1439 # Too much padding, it would require hardware padding to actually insert zeros
1440 return op
1441 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1442 return op
1443
1444 if op.type.is_avgpool_op():
1445 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1446 for pad, k_size in (
1447 (left, k_w),
1448 (right, k_w),
1449 (top, k_h),
1450 (bottom, k_h),
1451 ):
1452 if pad not in (0, k_size // 2):
1453 return op
1454 # Average pool is converted to depthwise, because NPU average pool + same padding
1455 # has a special implementation that is different from PAD followed by average pool with
1456 # valid padding.
1457 k_w, k_h = op.kernel.width, op.kernel.height
1458 ifm = op.ifm
1459 # Remember other inputs
1460 other_inputs = op.inputs[1:]
1461 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1462 quantization = QuantizationParameters(0.0, 255.0)
1463 quantization.scale_f32 = 1.0 / (k_w * k_h)
1464 quantization.zero_point = 0
1465 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1466 weights = np.full(shape, 1)
1467
1468 weight_tens = create_const_tensor(
1469 op.name + "_weights",
1470 shape,
1471 op.ifm.dtype,
1472 weights,
1473 np.uint8,
1474 purpose=TensorPurpose.Weights,
1475 quantization=quantization,
1476 )
James Peet7519d502021-07-19 16:47:58 +01001477 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001478 op.type = Op.DepthwiseConv2DBias
1479 op.inputs = []
1480 op.add_input_tensor(ifm)
1481 op.add_input_tensor(weight_tens)
1482 # Add bias tensor, all biases set to 0
1483 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001484 fixup_bias_tensors(op, arch, nng, DataType.int32)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001485 # Add other inputs
1486 op.inputs.extend(other_inputs)
1487 op.rounding_mode = NpuRoundingMode.NATURAL
1488
1489 # Bypass the PAD operator
1490 op.set_input_tensor(pad_op.ifm, 0)
1491 # Adjust the padding attributes of the convolution operator
1492 op.attrs["padding"] = Padding.EXPLICIT
1493 op.attrs["explicit_padding"] = (top, left, bottom, right)
1494 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001495 DebugDatabase.add_optimised(op, op)
1496
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001497 return op
1498
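# Simplified sketch of the size checks performed by replace_pad_by_hw_pad, using hypothetical
# padding and kernel values; it omits the dtype/scaling checks and the extra average-pool
# restriction. A one-pixel PAD in front of a 3x3 VALID convolution passes the checks and can
# therefore be absorbed as hardware (EXPLICIT) padding.
def _example_pad_absorbable(top=1, left=1, bottom=1, right=1, k_w=3, k_h=3, stride=1):
    within_reach = left <= k_w // 2 and right <= k_w // 2 and top <= k_h // 2 and bottom <= k_h // 2
    return within_reach and _leading_pad_ok(top, stride, k_h) and _leading_pad_ok(left, stride, k_w)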
1499
1500def convert_pad(op: Operation, arch, nng):
1501 """
1502 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1503 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1504 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1505 """
1506 if op.type != Op.Pad or not op.run_on_npu:
1507 return op
1508 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1509
1510 ifm = op.ifm
1511 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001512 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001513 ofm = op.ofm
1514 assert ofm is not None
1515 ofm.ops = []
1516 ofm_shape = op.ofm_shapes[0]
1517
1518 # Average pool op that copies IFM to the right place inside the OFM
1519 shp0 = Shape4D(0, 0, 0, 0)
1520 shp_top = shp0.with_height(top)
1521 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1522 avgpool_op.activation = op.activation
1523 quant = ofm.quantization
1524 pad_value = quant.zero_point
1525 # Add operations that fill the borders of the OFM
1526 if top > 0:
1527 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1528 zero_tens = create_const_tensor(
1529 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1530 )
1531 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1532 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1533 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1534 if bottom > 0:
1535 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1536 zero_tens = create_const_tensor(
1537 op.name + "_bottom",
1538 shape.as_list(),
1539 ofm.dtype,
1540 shape.elements() * [pad_value],
1541 np.uint8,
1542 quantization=quant,
1543 )
1544 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1545 create_avg_pool_for_concat(
1546 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1547 )
1548 if left > 0:
1549 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1550 zero_tens = create_const_tensor(
1551 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1552 )
1553 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1554 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1555 if right > 0:
1556 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1557 zero_tens = create_const_tensor(
1558 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], np.uint8, quantization=quant
1559 )
1560 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1561 create_avg_pool_for_concat(
1562 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1563 )
1564
1565 op.type = Op.ConcatTFLite
1566 return avgpool_op
1567
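# Plain-numpy picture of what the fall-back above produces, with example NHWC shapes: the OFM is
# filled with the pad value (the output zero point, i.e. real zero) and the IFM is copied into the
# interior at offset (top, left). The real pass builds this from one copying average pool plus up
# to four border-filling average pools rather than a single array operation.
def _example_pad_fallback(ifm_nhwc, top, bottom, left, right, pad_value=0):
    n, h, w, c = ifm_nhwc.shape
    ofm = np.full((n, h + top + bottom, w + left + right, c), pad_value, dtype=ifm_nhwc.dtype)
    ofm[:, top : top + h, left : left + w, :] = ifm_nhwc
    return ofm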
1568
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001569def fixup_bias_tensors(op, arch, nng, dtype=None):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001570 if op.type.needs_bias() and op.bias is None:
1571 # Op has no bias, add bias tensor filled with zeros
1572 nr_biases = op.inputs[1].shape[-1]
1573 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001574 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1575 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1576 # For int16 the selected bias DataType will have an impact on the scaling
1577 # used when encoding the scales and biases later. The default mode will match the
1578 # reference with reduced scaling for int64 bias.
1579 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1580 # is used to emulate average pool, int32 bias should be selected for full precision
1581 # int16 scaling.
1582 if dtype is None:
1583 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1584 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001585 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1586
1587 return op
1588
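# Small sketch of the default bias type rule described in fixup_bias_tensors (example only,
# no graph objects involved): int16 feature maps get an int64 zero bias, everything else int32,
# unless the caller passes an explicit dtype.
def _example_default_bias_dtype(ifm_dtype):
    return DataType.int64 if ifm_dtype == DataType.int16 else DataType.int32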
1589
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001590def fixup_asymmetric_weights(op, arch, nng):
1591 if op.run_on_npu and (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op()):
1592 if op.ifm.dtype == DataType.int8:
1593 if not np.all(op.weights.quantization.zero_point == 0):
1594 print(f"Warning: {op.type} '{op.name}' has asymmetric weights, zero points have been adjusted.")
1595 op.weights.quantization.zero_point *= 0
1596
1597 return op
1598
1599
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001600def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1601 if op.type == Op.Mean and op.run_on_npu:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001602 inp, axis = op.inputs
1603 shape = inp.shape
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001604 ofm_shape = op.ofm.shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001605 dims = len(shape)
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001606 dims_ofm = len(ofm_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001607
1608 # Height and width axes have different index depending on dimensions
1609 if axis.shape == [] or axis.shape[0] == 1: # single axis
1610 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1611 if dims in (2, 3):
1612 if axis == 0:
1613 h, w = shape[axis], 1
1614 else:
1615 h, w = 1, shape[axis]
1616 else:
1617 if axis == 1:
1618 h, w = shape[axis], 1
1619 else:
1620 h, w = 1, shape[axis]
1621 else: # multiple axes
1622 axis = sorted(axis.values)
1623 h, w = [shape[i] for i in axis]
1624
1625 # Set necessary depthwise attributes
1626 op.attrs.update(
1627 {
1628 "padding": Padding.VALID,
1629 "stride_h": 1,
1630 "stride_w": 1,
1631 "strides": (1, 1, 1, 1),
1632 "depth_multiplier": 1,
1633 "channel_multiplier": 1,
1634 "dilation_h_factor": 1,
1635 "dilation_w_factor": 1,
1636 "dilation": (1, 1, 1, 1),
1637 }
1638 )
1639 # Change op type
1640 op.type = Op.DepthwiseConv2DBias
1641 # Set IFM/OFM shapes after changing op type
1642 op.set_ifm_ofm_shapes()
1643
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001644 weight_scale, bias = 1, 0
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001645 ofmq, ifmq = op.ofm.quantization, inp.quantization
Johan Alfvén9d51ec42022-10-27 16:30:01 +02001646 if ifmq.is_scaling_equal(ofmq):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001647 # Here we can just use a simple AvgPool with truncating rounding,
1648 # as we're emulating simple integer division.
1649 op.rounding_mode = NpuRoundingMode.TRUNCATE
1650 op.type = Op.AvgPool
1651 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1652 else:
1653 op.rounding_mode = NpuRoundingMode.NATURAL
1654 weight_scale = 1 / (h * w)
1655 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1656 bias = -ifmq.zero_point * h * w
1657 fiq = ifmq.clone()
1658 fiq.zero_point = 0
1659 op.forced_input_quantization = fiq
1660
1661 # Change dimensions to 4
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001662 def extend_dims(dim, in_shape):
1663 if dim < 4:
1664 in_shape = [1] + in_shape
1665 if dim == 2:
1666 in_shape += [1]
1667 return in_shape
1668
1669 if dims < 4 or dims_ofm < 4:
1670 # Fix the ofm dimension when keep_dims is false
1671 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
1672 if isinstance(axis, int) and dims_ofm + 1 == dims:
1673 ofm_shape.insert(axis, 1)
1674 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
1675 for i in axis:
1676 ofm_shape.insert(i, 1)
1677 shape = extend_dims(dims, shape)
1678 dims_ofm = len(ofm_shape)
1679 ofm_shape = extend_dims(dims_ofm, ofm_shape)
1680 op.set_ifm_ofm_shapes()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001681
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001682 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001683 weight_shape = None
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001684 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001685 # This can only happen and be done for multiple axes, and
1686 # h * w <= 256 for DepthwiseConv2DBias
1687 # h * w <= 4096 for AvgPool
1688 # which is checked in supported ops
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001689 shape = [shape[0], 1, h * w, shape[3]]
1690 op.ifm_shapes[0] = Shape4D(shape)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001691 weight_shape = [1, h * w, shape[3], shape[0]]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001692 if h > 256 and op.type == Op.AvgPool:
1693 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1694
1695 # If the AvgPool version is used, we don't need to do anything else
1696 if op.type == Op.AvgPool:
wilisa0179a89042022-11-02 17:18:43 +00001697 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001698 return op
1699
1700 # Make unit weight tensor quantization
1701 weight_quant = ifmq.clone()
1702 weight_quant.min = 0
1703 weight_quant.max = 255
1704 weight_quant.scale_f32 = weight_scale
1705 weight_quant.zero_point = 0
1706
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001707 if weight_shape is None:
1708 # Set weight shape to [H,W,C,B]
1709 weight_shape = [h, w, shape[3], shape[0]]
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001710
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001711 # Add unit weight tensor
1712 op.set_input_tensor(
1713 create_const_tensor(
1714 "weights",
1715 weight_shape,
1716 inp.dtype,
1717 np.ones(weight_shape),
1718 value_dtype=np.uint8,
1719 quantization=weight_quant,
1720 ),
1721 1,
1722 )
James Peet7519d502021-07-19 16:47:58 +01001723 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001724
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001725 # Add bias tensor
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001726 bias_shape = [shape[-1]]
1727 op.inputs.append(create_const_tensor("bias", bias_shape, DataType.int32, np.ones(bias_shape) * bias))
wilisa0179a89042022-11-02 17:18:43 +00001728 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001729
1730 return op
1731
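# Hedged numeric sketch of the depthwise-conv mean emulation above, using example values only:
# with unit weights, weight_scale = 1 / (h * w) and a bias of -zp_in * h * w, the accumulator
# holds sum(x - zp_in), so the output scaling yields the mean of the zero-point-adjusted input.
# E.g. _example_mean_via_depthwise([10, 12, 14], zp_in=10) returns 2.0.
def _example_mean_via_depthwise(values, zp_in=0):
    h_times_w = len(values)
    acc = sum(values) + (-zp_in * h_times_w)  # integer accumulation including the emulated bias
    return acc / h_times_w  # the 1 / (h * w) weight scale applied on the way out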
1732
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001733def optimise_quantize(op: Operation, arch, nng):
1734
1735 if op.type == Op.Quantize and op.run_on_npu:
1736
1737 ifm, ofm = op.get_ifm_ofm()
1738 input_values = ifm.values
1739
1740 # Guard clause - input not const or no values to quantize
1741 if ifm.ops[0].type != Op.Const or input_values is None:
1742 return op
1743
1744 # Singular val in numpy array, convert to indexable array
1745 if input_values.ndim == 0:
1746 input_values = np.array([input_values])
1747
Fredrik Svedberg11563172022-07-06 14:54:12 +02001748 # Requantize int8 to int8 or int16 to int16
1749 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001750
1751 # scale needs to use double precision to match TFLite reference kernel
1752 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
1753 effective_multiplier, effective_shift = quantise_scale(effective_scale)
1754
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001755 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001756 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001757 input_val = val - ifm.quantization.zero_point
1758
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001759 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
1760 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001761
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001762 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
1763 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001764
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001765 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
1766 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001767
1768 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001769 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001770
1771 quantized_vals = []
1772 for val in input_values:
1773
1774 # Derive quantized value
1775 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001776 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
1777 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001778
1779 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001780 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
1781
1782 # Unsupported data type
1783 else:
1784 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001785
1786 # Make quantize op const and disconnect from parent node
1787
1788 # Remove reference of the current quant op from the parent tensor's consumer list
1789 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1790
1791 # Clear any references to parent node
1792 op.inputs = []
1793
1794 # Convert this quantize op to const
1795 op.type = Op.Const
1796
1797 return op
1798
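# Minimal sketch of the constant requantisation performed by optimise_quantize, with made-up
# int8 scales and zero points; quantise_scale and fp_math.multiply_by_quantized_multiplier are
# the same helpers the pass uses, and the clamp corresponds to the int8 quant_min/quant_max.
def _example_requantize_int8(vals, ifm_scale=0.05, ifm_zp=0, ofm_scale=0.1, ofm_zp=3):
    multiplier, shift = quantise_scale(np.float64(ifm_scale) / np.float64(ofm_scale))
    out = []
    for val in vals:
        requant = fp_math.multiply_by_quantized_multiplier(val - ifm_zp, multiplier, shift) + ofm_zp
        out.append(max(-128, min(127, requant)))
    return out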
1799
Ayaan Masood4965fae2022-06-29 11:30:57 +01001800def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
1801 """Static optimisation for SHAPE operator output value known at compile time"""
1802
1803 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
1804
1805 if op.type == Op.Shape and op.run_on_npu:
1806
1807 ifm, ofm = op.get_ifm_ofm()
1808
1809 if len(ifm.shape) != ofm.shape[0]:
1810 return op
1811
1812 # Remove reference of the current shape op from the parent tensor's consumer list
1813 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1814
1815 # Clear any references to parent node
1816 op.inputs = []
1817
1818 # Convert this SHAPE op to const
1819 op.type = Op.Const
1820
1821 # Add size calculation to shape output tensors
1822 ofm.values = np.array(ifm.shape)
1823
1824 return op
1825
1826
Tim Hallea4ba662022-11-11 18:19:53 +00001827def fixup_dilation_gt2(op, arch, nng):
1828 assert op.run_on_npu
1829 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
1830 dilation_w, dilation_h = op.get_kernel_dilation()
1831
1832 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
1833 # kernel
1834 if dilation_w > 2 or dilation_h > 2:
1835 kernel_w, kernel_h = op.get_kernel_size()
1836 kernel_ic = op.weights.shape[-2]
1837 kernel_oc = op.weights.shape[-1]
1838
1839 # if the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
1840 # of 2. this allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
1841 # odd = 1, even = 2
1842 hw_dilation_h = 1 if (dilation_h & 1) else 2
1843 hw_dilation_w = 1 if (dilation_w & 1) else 2
1844
1845 scale_dilation_h = dilation_h // hw_dilation_h
1846 scale_dilation_w = dilation_w // hw_dilation_w
1847
1848 # create new empty kernel (HWIO format)
1849 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
1850 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
1851
1852 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
1853 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
1854
1855 # copy the original kernel values into the new sparse kernel
1856 for h in range(0, kernel_h):
1857 for w in range(0, kernel_w):
1858 new_h = h * scale_dilation_h
1859 new_w = w * scale_dilation_w
1860 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
1861
1862 # update the weight tensor with the new dilated kernel
1863 op.weights.shape = new_kernel_shape
1864 op.weights.values = new_kernel_values
1865
1866 # enable(=2) / disable(=1) hardware dilation
1867 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
1868 op.attrs["dilation_h_factor"] = hw_dilation_h
1869 op.attrs["dilation_w_factor"] = hw_dilation_w
1870
1871 return op
1872
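# Worked sketch of the manual kernel dilation above, with made-up sizes: a 3x3 kernel with
# dilation 4 becomes hardware dilation 2 plus a scale dilation of 2, i.e. a 5x5 sparse kernel
# ((3 - 1) * 2 + 1 = 5) whose original taps sit on every second row/column, the rest being zeros.
# E.g. _example_dilate_kernel(np.ones((3, 3, 1, 1))).shape == (5, 5, 1, 1).
def _example_dilate_kernel(kernel_hwio, scale_dilation_h=2, scale_dilation_w=2):
    k_h, k_w, k_ic, k_oc = kernel_hwio.shape
    new_shape = ((k_h - 1) * scale_dilation_h + 1, (k_w - 1) * scale_dilation_w + 1, k_ic, k_oc)
    dilated = np.zeros(new_shape, dtype=kernel_hwio.dtype)
    dilated[::scale_dilation_h, ::scale_dilation_w, :, :] = kernel_hwio
    return dilated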
1873
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001874def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02001875 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001876 return op
1877
1878
1879def tflite_optimise_graph(nng, arch):
Fredrik Svedberg11563172022-07-06 14:54:12 +02001880 # Compile time static optimisations
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001881 optimisation_list = [optimise_quantize, convert_shape_op_to_constant_tensor]
1882
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001883 for idx, sg in enumerate(nng.subgraphs):
1884 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001885 nng,
1886 sg,
1887 arch,
1888 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01001889 optimisation_list,
1890 rewrite_unsupported=False,
1891 )
1892
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001893 # Pre-processing step
1894 pre_process_list = [
1895 supported_operator_check,
1896 set_ifm_ofm_op_shapes,
1897 ]
1898
Ayaan Masood4965fae2022-06-29 11:30:57 +01001899 for idx, sg in enumerate(nng.subgraphs):
1900 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1901 nng,
1902 sg,
1903 arch,
1904 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02001905 pre_process_list,
1906 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001907 )
1908
1909 # Handle Concat Ops
1910 for idx, sg in enumerate(nng.subgraphs):
1911 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1912 sg.refresh_after_modification()
1913
1914 # Handle Split Ops
1915 for idx, sg in enumerate(nng.subgraphs):
1916 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1917 nng,
1918 sg,
1919 arch,
1920 [],
1921 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1922 rewrite_unsupported=False,
1923 )
1924
1925 for idx, sg in enumerate(nng.subgraphs):
1926 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001927 nng,
1928 sg,
1929 arch,
1930 [rewrite_split_ops],
1931 [],
1932 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001933 )
1934
1935 # Handle sg input output
1936 for idx, sg in enumerate(nng.subgraphs):
1937 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001938 nng,
1939 sg,
1940 arch,
1941 [],
1942 [fix_sg_input_output],
1943 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001944 )
1945
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +02001946 # Removal of memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001947 for sg in nng.subgraphs:
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +02001948 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_memory_only_ops])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001949 sg.refresh_after_modification()
1950
1951 # Rewrite of operators
1952 op_rewrite_list = [
1953 set_tensor_equivalence,
1954 convert_mean_to_depthwise_conv_or_avgpool,
1955 convert_depthwise_to_conv,
1956 convert_conv_to_fc,
1957 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001958 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02001959 convert_mul_max_to_abs_or_lrelu,
1960 convert_lrelu,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001961 optimise_strided_conv,
1962 convert_hardswish_to_lut,
1963 rewrite_fully_connected_input,
1964 convert_batched_fc_shape,
1965 fixup_conv2d_backprop,
1966 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001967 reorder_depthwise_weights,
Tim Hall885033b2022-07-21 11:46:03 +01001968 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001969 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001970 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001971 convert_tanh_sigmoid_to_lut,
1972 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00001973 fixup_dilation_gt2,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001974 ]
1975
1976 for idx, sg in enumerate(nng.subgraphs):
1977 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001978 nng,
1979 sg,
1980 arch,
1981 [],
1982 op_rewrite_list,
1983 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001984 )
1985
1986 for idx, sg in enumerate(nng.subgraphs):
1987 # remove passthrough tensors and attempt further optimizations
1988 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1989 nng,
1990 sg,
1991 arch,
1992 [remove_passthrough_tensor],
1993 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
1994 )
1995
1996 # Removal of SplitSliceRead, need to be done after optimisation has been performed,
1997 # since ifm/ofm_shapes are of importance to this function
1998 for sg in nng.subgraphs:
1999 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2000 sg.refresh_after_modification()
2001
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002002 # Make sure that const optimisations on subgraph outputs are handled correctly
2003 for sg in nng.subgraphs:
2004 for ofm in sg.output_tensors:
2005 if ofm.is_const and ofm.ops[0].type_changed:
2006 # Subgraph output cannot be const - insert a memory copy
2007 op = ofm.ops[0]
2008 ofm_clone = ofm.clone()
2009 ofm_clone.values = ofm.values
2010 ofm.values = None
2011 np_dtype = ofm.dtype.as_numpy_type()
2012 zero = create_const_tensor("zero", [1], ofm.dtype, [0], np_dtype, quantization=ofm.quantization)
2013 memcpy = create_add_nop(f"{ofm.name}_copy")
2014 memcpy.add_input_tensor(ofm_clone)
2015 memcpy.add_input_tensor(zero)
2016 memcpy.set_output_tensor(ofm)
2017 memcpy.set_ifm_ofm_shapes()
2018 op.set_output_tensor(ofm_clone)
2019 DebugDatabase.add_optimised(op, memcpy)
2020
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002021 return nng