# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .api import NpuRoundingMode
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .numeric_util import clamp_sigmoid
from .numeric_util import round_away_zero
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype

passthrough_nodes = (Op.Identity,)


def create_avg_pool_for_concat(concat_op, name, ifm, ifm_shape: Shape4D, write_offset: Shape4D):
    """Creates an average pool for the given concat op/input feature map"""
    ofm = concat_op.ofm
    avgpool_op = create_avgpool_nop(name)
    avgpool_op.inputs = [ifm]
    avgpool_op.outputs = [ofm]

    avgpool_op.write_offset = write_offset
    avgpool_op.write_shape = ifm_shape
    ofm.ops.append(avgpool_op)
    avgpool_op.ifm_shapes.append(ifm_shape)
    avgpool_op.ofm_shapes.append(concat_op.ofm_shapes[0])
    avgpool_op.memory_function = Op.ConcatSliceWrite
    DebugDatabase.add_optimised(concat_op, avgpool_op)
    return avgpool_op


def remove_passthrough_tensor(tens, arch, nng):
    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
        assert len(tens.ops[0].inputs) == 1
        tens = tens.ops[0].inputs[0]
    return tens


def rewrite_concat_ops(op, arch):
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]

        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert ofm.shape[axis] == offset

    return op


def rewrite_split_ops(tens, arch, nng):

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0
        if None in (offset_end, offset_start):
            read_shape = None
        else:
            # the read shape is relative to each start offset
            read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    read_shape = split_op.ofm_shapes[idx]
                    break

                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.read_shapes[0] = read_shape
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens

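# Illustrative example (added note, not executed): splitting a [1, 8, 8, 16] tensor into two
# outputs along the depth axis with rewrite_split_ops above gives each output its own
# SplitSliceRead; the first reads at offset [0, 0, 0, 0] with read shape [1, 8, 8, 8] and the
# second at offset [0, 0, 0, 8], because offset_start accumulates the size of every preceding
# output along axis_4D.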

def remove_SplitSliceRead(op, arch):

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer, or if an avgpool needs to be inserted
        if (
            len(op.ofm.consumer_list) == 1
            and op.ofm.consumer_list[0] is not None
            and op.ofm.consumer_list[0].run_on_npu
            and op.ofm.consumer_list[0].type not in memory_only_ops
            and op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape)
        ):
            # SplitSliceRead can be performed by tensor consumer
            cons_op = op.ofm.consumer_list[0]
            move_splitsliceread_to_consumer(op, cons_op)
        else:
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]
            avgpool_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)


def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt

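# Illustrative example (added note, not executed): a 3x3 kernel with stride 2 over a 224x224
# SAME padded IFM needs a total padding of 1 in each dimension (assuming needed_total_padding
# in graph_optimiser_util returns the usual max(needed_input - input_size, 0)), which
# calc_padding_and_skirt above splits as top/left = 0 and bottom/right = 1, giving a
# padding of (0, 0, 1, 1) and a skirt of (0, 0, 1, 1).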

def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt


def fixup_conv2d_backprop(op, arch, nng):
    if op.type == Op.Conv2DBackpropInput:
        # flip the inputs
        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
        op.type = Op.Conv2DBackpropInputSwitchedBias
        op.ifm_resampling_mode = resampling_mode.TRANSPOSE

        # Update strides
        op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
        DebugDatabase.add_optimised(op, op)

    return op


# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    op.type = Op.Add  # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.name = op.name + "_add"
    # Create an input tensor filled with zeros
    name = op.inputs[1].name + "_add"
    dtype = op.inputs[0].dtype
    shape = op.ofm_shapes[0].as_list()
    values = np.zeros(shape, dtype.as_numpy_type())
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = 1.0
    quantization.zero_point = 0
    op.inputs[1] = op.inputs[0]
    op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op


# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling, which will be done as part of this operation. The kernel contains a single
# coefficient to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]
    dw_op_attrs = {
        "padding": Padding.VALID,
        "stride_h": 1,
        "stride_w": 1,
        "strides": (1, 1, 1, 1),
        "depth_multiplier": 1,
        "channel_multiplier": 1,
        "dilation_h_factor": 1,
        "dilation_w_factor": 1,
        "dilation": (1, 1, 1, 1),
    }

    # change resizebilinear to depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(dw_op_attrs)
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0
    ofm_dtype = ofm.dtype
    if ofm_dtype.type == BaseType.UnsignedInt:
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
        weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    weight_values = [0] * (upscale_factor * upscale_factor)
    centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
    weight_values[centre_coeff] = 1

    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(
        create_const_tensor(
            "weights",
            weight_shape,
            ofm_dtype,
            np.array(weight_values).reshape(weight_shape),
            quantization=weight_quant,
        ),
        1,  # inputs tensor weight index
    )

    # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None, DataType.int32)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op

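# Illustrative example (added note, not executed): for upscale_factor = 2 the weight tensor built
# in convert_resizenn_ac_to_depthwise_conv above is 2x2 per channel with
# centre_coeff = (2 // 2) * 2 + (2 // 2) = 3, i.e. the single 1 lands on the bottom-right
# position (D in the diagram), which is the value selected by nearest-neighbour with align corners.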

# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize op's upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    pre_op = op
    outputs = op.outputs
    dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    n = int(np.log2(upscale_factor))

    # Perform x2 upscaling n-1 times
    scaled_op = pre_op
    for count in range(n - 1):
        if count > 0:
            scaled_op = op.clone(f"_{count}")
            scaled_op.inputs[0] = pre_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_shape = upscaled_shape * 2
        shape = op.ofm_shapes[0].as_list()
        shape[1:3] = upscaled_shape
        out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
        out_tens.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    # Last x2 upscaling
    if n > 1:
        scaled_op = op.clone(f"_{n-1}")
        scaled_op.inputs[0] = pre_op.outputs[0]

    if scaled_op.original_type == Op.ResizeBilinear:
        if scaled_op.attrs["align_corners"]:
            # no padding
            scaled_op.attrs["padding"] = Padding.VALID
        else:
            # padding to the right and bottom (limits average pool to 8x8 kernel)
            scaled_op.attrs["padding"] = Padding.EXPLICIT
            scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

        # kernel size dependent on the upscaling factor
        scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
    else:  # Op.ResizeNearestNeighbor
        if scaled_op.attrs["align_corners"]:
            # use depthwise conv to select the correct value
            scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
        else:
            # Keep 1x1 kernel and average pool, this applies both when
            # half-pixel-centers is True and False. Calculations are the
            # same in the reference.
            pass

    scaled_op.outputs = outputs
    scaled_op.outputs[0].ops = [scaled_op]
    scaled_op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, scaled_op)

    return op

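# Illustrative example (added note, not executed): an x8 resize gives n = log2(8) = 3 in
# convert_resize_to_upscale_and_average_pool above, so the original op plus two clones each
# perform a nearest-neighbour x2 upscale, and the last stage also applies the final average
# pool with an 8x8 kernel (ResizeBilinear) or keeps the 1x1 kernel (ResizeNearestNeighbor
# without align corners).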

def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
    def _compute_interpolation_values(index, input_size, output_size):
        scale = input_size / output_size
        scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
        lower_bound = max(np.floor(scaled_value), 0)

        return scaled_value, lower_bound

    def _compute_kernels(input_height, input_width, output_height, output_width):
        kernels = []
        for y in (1, 2):
            for x in (1, 2):
                sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
                sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)

                # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
                # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
                # top-to-bottom - same as the depthwise convolution strides across each tile
                kernel = np.zeros((2, 2))
                kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
                kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
                kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
                kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
                kernel *= 16
                kernels.append(kernel)

        return kernels

    def _build_convolutions(op, kernels):
        dw_op_attrs = {
            "padding": Padding.TILE,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
        ifm = op.ifm
        ofm = op.ofm
        ofm.ops = []
        elem_size = 2 if ofm.dtype == DataType.int16 else 1

        n, h, w, c = ifm.shape
        _, _, ow, _ = ofm.shape

        intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
        intermediate_tens.quantization = op.outputs[0].quantization.clone()
        avgpool_op = op
        avgpool_op.name = "rb_init_avgpool"
        avgpool_op.type = Op.AvgPool
        avgpool_op.attrs["padding"] = Padding.VALID
        avgpool_op.attrs["stride_w"] = 1
        avgpool_op.attrs["stride_h"] = 1
        avgpool_op.attrs["filter_width"] = 1
        avgpool_op.attrs["filter_height"] = 1
        avgpool_op.attrs["strides"] = [1, 1, 1, 1]
        avgpool_op.attrs["ksize"] = [1, 1, 1, 1]

        avgpool_op.add_input_tensor(ifm)
        avgpool_op.set_output_tensor(intermediate_tens)
        avgpool_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, op)

        dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
        dw_conv._original_type = Op.ResizeBilinear
        dw_conv.write_shape = Shape4D(n, h, w, c)
        dw_conv.write_offset = Shape4D(0, 0, 0, 0)

        # Set the output rounding mode. Resize bilinear requires rounding away from zero. Therefore, we need to
        # adjust the accumulated value by a "small" amount before applying natural rounding. The "small" amount
        # should be big enough to cause a x.5 to be rounded correctly but small enough not to cause smaller
        # values to be incorrectly rounded
        ofm.quantization.next_after = True
        dw_conv.rounding_mode = NpuRoundingMode.NATURAL

        # Double height and width stride to write the output of each of the four depthwise convolutions below
        # interleaved with each other when combined with OFM tile base offsets.
        dw_conv.ofm_stride_multiplier = [1, 2, 2]  # C/H/W

        # Choose tile padding direction - pad by 1 with edge values in two directions.
        # For example, TL (top left) will pad top and left in H/W-plane in all channels.
        directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]]  # TL, TR, BL, BR
        for i in (0, 1):
            for j in (0, 1):
                index = i * 2 + j
                dw_conv.name = f"depthwise_conv_{index}"
                dw_op_attrs["explicit_padding"] = directions[index]
                dw_conv.attrs.update(dw_op_attrs)

                # This will offset the start of the write by modifying the Tile 0 base address
                dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size

                ofm.ops.append(dw_conv)
                dw_conv.outputs = [ofm]

                kernel = kernels[index]
                shape = [2, 2, 1, c]
                kernel = np.dstack([kernel] * c)

                quant = QuantizationParameters()
                quant.zero_point = 0
                quant.scale_f32 = 1.0 / 16

                dw_conv.inputs = []
                dw_conv.add_input_tensor(intermediate_tens)
                dw_conv.add_input_tensor(
                    create_const_tensor(
                        "weights",
                        shape,
                        intermediate_tens.dtype,
                        np.array(kernel).reshape(shape),
                        quantization=quant,
                    ),
                )

                # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
                # need to append the bias tensor as resize ops only have 2 inputs
                assert len(dw_conv.inputs) == 2
                dw_conv.inputs.append(None)
                fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)

                dw_conv.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, dw_conv)

                dw_conv = dw_conv.clone(f"_{index}")
        return op

    _, input_height, input_width, _ = op.ifm.shape
    _, output_height, output_width, _ = op.ofm.shape

    kernels = _compute_kernels(input_height, input_width, output_height, output_width)
    op = _build_convolutions(op, kernels)

    return op

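# Illustrative example (added note, not executed): for an x2 upscale with half_pixel_centers=True
# the fractional offsets computed in convert_resizebilinear_to_depthwise_convolutions above are
# 0.25 and 0.75, so each 2x2 kernel holds a permutation of the bilinear weights
# {0.5625, 0.1875, 0.1875, 0.0625}, scaled by 16 to the integers {9, 3, 3, 1}; the factor of
# 1/16 is restored through the weight quantisation (quant.scale_f32 = 1.0 / 16).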

def fixup_resize(op, arch, nng):
    if op.type.is_resize_op() and op.run_on_npu:
        if op.ifm_shapes[0] == op.ofm_shapes[0]:
            # Bypass the resize op which is essentially a NOP
            op.inputs = op.inputs[:1]
            op.type = Op.Identity
        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
            convert_resize_1x1_to_add(op)
        elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
            convert_resizebilinear_to_depthwise_convolutions(op)
        else:
            convert_resize_to_upscale_and_average_pool(op)

    return op


def convert_nop_split_to_identity(op, arch, nng):
    if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it shouldn't, remove_passthrough_tensor will fail appropriately
        op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
        op.type = Op.Identity
    return op


def rewrite_fully_connected_input(op: Operation, arch, nng):

    if op.type == Op.FullyConnected:
        new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
        assert new_shape is not None, "Tensor can not be reshaped to 2D"
        op.ifm_shapes[0] = new_shape

        if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
            # If IFM is batching then also make sure OFM is batching
            h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
            op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])

    return op


def convert_batched_fc_shape(op, arch, nng):
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op

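# Illustrative example (added note, not executed): a FullyConnected with IFM shape [8, 1, 1, 64]
# is remapped by convert_batched_fc_shape above to [1, 2, 4, 64] using batching_split[8] = (2, 4),
# so the batch is processed as a 2x4 spatial plane; batch sizes without a table entry fall back
# to (1, n).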

def unfuse_activation_function(op):
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, act_op)


def rewrite_stridedslice_output(op, arch, nng):
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op

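# Illustrative example (added note, not executed): a StridedSlice with shrink_axis_mask = 0b100
# removed axis 2 from its output, so the loop in rewrite_stridedslice_output above clears one
# set bit per iteration and takes log2 of the cleared bit to recover axis = 2, then re-inserts
# a 1 at that position before forming the 4D output shape.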

def rewrite_unpack_output(op, arch, nng):
    tens = op.outputs[0]
    if op.run_on_npu and op.type == Op.Unpack:
        # Unpack is also referred to as Unstack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis
        op.type = Op.UnpackReshaped
        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

        axis_4D = axis + (4 - len(desired_output_shape))
        op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

        for idx, out_tens in enumerate(op.outputs):
            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op


def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"],
                    op.kernel,
                    input_shape,
                    op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


def reorder_depthwise_weights(op, arch, nng):
    if op.type.is_depthwise_conv2d_op():
        weight_tensor = op.inputs[1]
        weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
        weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        weight_tensor.weight_transpose_depthwise = True

    return op


def fixup_strided_conv(op, arch, nng):
    if op.type != Op.Conv2DBias:
        return op
    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]

    # Do not optimize if op is not the first in the network and stride is
    # supported by the hardware
    if op.op_index != 0 and stride_x < 4:
        return op
    op.ifm.needs_linear_format = True

    if (
        (stride_x == 2 or stride_x == 4)
        and ifm_shape.depth <= 4
        and ifm_shape.width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    ):
        k_w, _ = op.get_kernel_size()
        curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
        optimised_padding_x = needed_total_padding(ifm_shape.width // stride_x, 1, (k_w + 1) // stride_x)
        padding_type = op.attrs.get("padding", None)

        # If padding is enabled, check if current padding matches optimised padding
        if not padding_type or (padding_type != Padding.VALID and curr_padding_x != optimised_padding_x):
            # Horizontal padding would become different after optimisation; this would not work
            return op
        # IFM
        op.ifm_shapes[0] = Shape4D(
            [ifm_shape.batch, ifm_shape.height, ifm_shape.width // stride_x, ifm_shape.depth * stride_x]
        )

        # Weights
        weight_shape = weight_tensor.shape
        if weight_shape[1] % 2 != 0:
            weight_shape[1] = weight_shape[1] + 1
            padded_array = np.zeros(weight_shape)
            for i in range(weight_shape[0]):
                padded_array[i] = np.vstack(
                    [
                        weight_tensor.values[i],
                        np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
                    ]
                )
            weight_tensor.values = padded_array

        # Change weight shape based on stride_x
        weight_shape[1] //= stride_x
        weight_shape[2] *= stride_x

        weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides
        stride_x = 1
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

    return op

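# Illustrative example (added note, not executed): with stride_x = 2 an IFM of [1, 224, 224, 3]
# is viewed by fixup_strided_conv above as [1, 224, 112, 6] (pairs of width positions folded
# into the depth dimension), the kernel width is halved while its input-channel dimension
# doubles, and the convolution then runs with stride_w = 1 on the hardware.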

def convert_conv_to_fc(op, arch, nng):
    # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
    if op.type == Op.Conv2DBias:
        h = op.ifm_shapes[0].height
        w = op.ifm_shapes[0].width
        kh, kw, _, _ = op.inputs[1].shape
        if h == 1 and w == 1 and kh == 1 and kw == 1:
            # Overwrite this op as a Fully Connected Op
            op.name += "_fc"
            op.type = Op.FullyConnected
            op.attrs = {
                "weights_format": 0,
            }
            # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
            weight_tensor = op.inputs[1]
            weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
    return op


def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Add explicit rescaling
            rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
            multiplier, shift = scaling.quantise_scale(rescale)
            relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            op = relu_fused_op
    return op


def convert_softmax(op, arch, nng):
    if op.type == Op.Softmax and op.run_on_npu:
        softmax = SoftMax(op)
        op = softmax.get_graph()
    return op


def convert_prelu(op, arch, nng):
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If const alpha check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu
            alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
                # Multiply with alpha tensor
                mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
                mul_alpha.add_input_tensor(ifm)
                mul_alpha.add_input_tensor(alpha)
                fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
                mul_alpha.set_output_tensor(fm_alpha)
                mul_alpha.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, mul_alpha)
                if check_quantized_tens_scaling_equal(ifm, ofm):
                    # No scaling is needed
                    fm_id = ifm
                else:
                    # Add multiplication with identity
                    mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
                    mul_identity.add_input_tensor(ifm)
                    # Create const tensor containing identity as scalar
                    quantization = ifm.quantization.clone()
                    quantization.scale_f32 = np.float32(1)
                    quantization.zero_point = 0
                    one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
                    mul_identity.add_input_tensor(one)
                    # Make sure that fm_id is allocated to a different address than fm_alpha
                    fm_id = ofm.clone(op.name + "_id", set_unique=True)
                    mul_identity.set_output_tensor(fm_id)
                    mul_identity.set_ifm_ofm_shapes()
                    DebugDatabase.add_optimised(op, mul_identity)

                # Combine scaled and alpha multiplied values
                max_op = Operation(Op.Maximum, op.name + "_max")
                max_op.add_input_tensor(fm_alpha)
                max_op.add_input_tensor(fm_id)
                max_op.set_output_tensor(ofm)
                max_op.set_ifm_ofm_shapes()

                DebugDatabase.add_optimised(op, max_op)
                ifm.consumer_list.remove(op)
                return max_op

        # Catch all PReLU conversion for the cases that could not be optimised above
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
        min_op.set_output_tensor(fm_negative)
        min_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, min_op)

        # and multiply with alpha tensor
        mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
        mul_alpha.add_input_tensor(fm_negative)
        mul_alpha.add_input_tensor(alpha)
        fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
        mul_alpha.set_output_tensor(fm_alpha)
        mul_alpha.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_alpha)

        # Select (and scale) values > 0
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_scaled)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)

        # Add scaled and alpha multiplied values (without scaling)
        add_op = Operation(Op.Add, op.name + "_add")
        add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
        add_op.add_input_tensor(fm_alpha)
        add_op.add_input_tensor(fm_scaled)
        add_op.set_output_tensor(ofm)
        add_op.set_ifm_ofm_shapes()

        DebugDatabase.add_optimised(op, add_op)
        ifm.consumer_list.remove(op)
        op = add_op

    return op


def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X    For X = -1 or X > 0
    |   \   /     This subgraph can be replaced with either
    |    Mul      an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max
    """

    if op.type == Op.Maximum:
        # finds the Mul input(s) to the Max
        muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
        if len(muls) == 1:
            mul = muls[0].ops[0]
        elif len(muls) == 2:
            # In the case both inputs are Muls, find the one with the same input as the Max
            mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
            if len(mul_ifms):
                mul = mul_ifms[0].ops[0]
            else:
                # Not using same input
                return op
        else:
            # No Mul inputs
            return op

        # make sure the Mul doesn't have any other consumers
        mul_ofm = mul.outputs[0]
        if len(mul_ofm.consumers()) != 1:
            return op
        # make sure the Mul doesn't have a fused activation function
        if mul.activation:
            return op
        ifm, ofm = op.get_ifm_ofm()
        if ifm is None or ofm is None:
            return op

        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
            return op
        if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
            return op

        # finds the branched input that goes to both the Max and the Mul
        shared = set(op.inputs) & set(mul.inputs)
        if len(shared) == 1:
            shared_in = shared.pop()
            # find the constant scalar input to the Mul
            const_tens = (set(mul.inputs) - {shared_in}).pop()
            # check that it is a scalar
            if const_tens.shape != []:
                return op
            const = const_tens.ops[0]
            # check that it is a constant
            if const.type != Op.Const:
                return op
            # Remove the Mul from the shared input's consumers
            shared_in.consumer_list.remove(mul)
        else:
            return op

        val = const.outputs[0].values
        if val >= 0:
            new_op = Op.LeakyRelu
            op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scale", to be used as input
            # to the LUT construction
            alpha_scalar = const_tens.values - const_tens.quantization.zero_point
            mul_ifm_scale = np.double(ifm.quantization.scale_f32)
            mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
            mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
            alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
            op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
        elif val == -1:
            new_op = Op.Abs
        else:
            return op

        op.type = new_op
        op.name = op.name.replace("Maximum", new_op.name)
        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
        op.inputs = [shared_in]
        op.set_ifm_ofm_shapes()

        # Record optimisation in debug database
        DebugDatabase.add_optimised(op, op)

    return op

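# Illustrative example (added note, not executed): a graph computing Maximum(x, 0.1 * x), where
# 0.1 is a constant scalar and the IFM, OFM and Mul output share the same int8/uint8
# quantisation, matches the pattern in convert_mul_max_to_abs_or_lrelu above and collapses to
# a single LeakyRelu with alpha = 0.1; Maximum(x, -1 * x) collapses to Abs.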
1119
1120def convert_hardswish_to_lut(op, arch, nng):
1121 if op.type == Op.HardSwish:
1122 ifm, ofm = op.get_ifm_ofm()
1123 # Generate the LUT
1124 ifm_scale = np.double(ifm.quantization.scale_f32)
1125 ofm_scale = np.double(ofm.quantization.scale_f32)
1126 zp_in = ifm.quantization.zero_point
1127 zp_out = ofm.quantization.zero_point
1128 ifm_scale_hires = (1 / 128) * ifm_scale
1129 relu_multiplier = np.double(3 / 32768)
1130 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1131 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1132 # Use 16bit scale
1133 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1134 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1135
1136 values = []
1137 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1138 quantized_min = min(ix)
1139 quantized_max = max(ix)
1140 for x in ix:
1141 input_value = x - zp_in
1142 input_value_hires = input_value * 128
1143 # Compute the input value on essentially the output scale, not shifted yet
1144 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1145 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
1146 relu_value = np.int16(input_value_hires)
1147 if relu_shift < 31:
1148 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1149
1150 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1151
1152 if relu_shift < 31:
1153 relu_value = fp_math.shift_left16(relu_value, 1)
1154
1155 if relu_shift > 31:
1156 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1157
1158 # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
1159 # Now convert that to a 16bit fixedpoint value in [0, 1]
1160 relu_value = (relu_value + (1 << 15)) >> 1
1161 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1162 shift = 31 - out_shift
1163 shift = -shift if shift < 0 else 0
1164 # Finally apply the output shift
1165 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1166 lut_result = min(quantized_max, max(quantized_min, lut_result))
1167 values.append(lut_result)
1168 return convert_to_lut(op, values, "hardswish")
1169 return op
1170
1171
1172def convert_lrelu_to_mul_max(op, arch):
1173 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1174 # (the opposite of convert_mul_max_to_abs_or_lrelu)
1175 ifm, ofm = op.get_ifm_ofm()
1176 if ifm is None or ofm is None:
1177 return op
1178
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001179 alpha = np.float32(op.attrs["alpha"])
1180 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001181 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001182 if use_mul_max:
1183 mul_ifm = ifm
1184 new_op = Op.Maximum
1185 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001186 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001187 no_scale_quant = ifm.quantization.clone()
1188 no_scale_quant.scale_f32 = None
1189 no_scale_quant.zero_point = 0
1190 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1191
1192 # Select values < 0
1193 min_op = Operation(Op.Minimum, op.name + "_min")
1194 min_op.add_input_tensor(ifm)
1195 min_op.add_input_tensor(zero)
1196 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001197 if alpha < 0 and not is_converted_prelu:
1198 # For negative alpha that is not from a converted PReLU we need to use
1199 # int32 Mul below to perform the (negative) alpha scaling
1200 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001201 min_op.set_output_tensor(mul_ifm)
1202 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001203 new_op = Op.Add
1204 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001205 DebugDatabase.add_optimised(op, min_op)
1206
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001207 # Add multiplication with alpha
1208 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001209 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001210 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001211 quantization = ifm.quantization.clone()
1212 quantization.min = 0
1213 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1214 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001215 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001216 if is_converted_prelu:
1217 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001218 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001219 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001220 elif alpha == 0 or np.isinf(1 / alpha):
1221 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001222 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001223 scalar = 0
1224 else:
1225 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001226 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001227 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001228 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1229 else:
1230 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001231 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001232 mul_alpha.add_input_tensor(alpha_tens)
1233 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1234 mul_alpha.set_output_tensor(fm_alpha)
1235 mul_alpha.set_ifm_ofm_shapes()
1236 DebugDatabase.add_optimised(op, mul_alpha)
1237
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001238 if not use_mul_max:
1239 relu_op = Operation(Op.Relu, op.name + "_relu")
1240 relu_op.add_input_tensor(ifm)
1241 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1242 relu_op.set_output_tensor(fm_id)
1243 relu_op.set_ifm_ofm_shapes()
1244 DebugDatabase.add_optimised(op, relu_op)
1245 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001246 # No identity multiplication is needed
1247 fm_id = ifm
1248 else:
1249 # Add multiplication with identity
1250 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1251 mul_identity.add_input_tensor(ifm)
1252 # Create const tensor containing identity as scalar
1253 quantization = ifm.quantization.clone()
1254 quantization.min = 0
1255 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001256 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001257 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001258 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001259 mul_identity.add_input_tensor(identity_tens)
1260 # Make sure that fm_id is allocated to a different address than fm_alpha
1261 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1262 mul_identity.set_output_tensor(fm_id)
1263 mul_identity.set_ifm_ofm_shapes()
1264 DebugDatabase.add_optimised(op, mul_identity)
1265
1266 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001267 op.type = new_op
1268 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001269 op.inputs = []
1270 ifm.consumer_list.remove(op)
1271 op.add_input_tensor(fm_alpha)
1272 op.add_input_tensor(fm_id)
1273 op.set_ifm_ofm_shapes()
1274
1275 DebugDatabase.add_optimised(op, op)
1276 return op
1277
1278
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001279def convert_to_lut8(op, fn, fn_name):
1280 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1281 # fn is a function(real) -> real
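    # Each LUT entry maps a quantized input x to round_away_zero(zp_out + fn(ifm_scale * (x - zp_in)) / ofm_scale),
    # clamped to the quantized range. E.g. with fn=abs, ifm_scale=0.5, ofm_scale=0.25 and both zero points 0,
    # the input value x = -3 (real -1.5) maps to LUT entry 6 (real 1.5).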
1282 ifm, ofm = op.get_ifm_ofm()
1283 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1284 return op
1285 # Generate the LUT
1286 ifm_scale = np.double(ifm.quantization.scale_f32)
1287 ofm_scale = np.double(ofm.quantization.scale_f32)
1288 zp_in = ifm.quantization.zero_point
1289 zp_out = ofm.quantization.zero_point
1290 values = []
1291 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1292 quantized_min = min(ix)
1293 quantized_max = max(ix)
1294 for x in ix:
1295 x_real = ifm_scale * (x - zp_in)
1296 y_real = fn(x_real)
1297 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1298 lut_result = min(quantized_max, max(quantized_min, lut_result))
1299 values.append(lut_result)
1300 return convert_to_lut(op, values, fn_name)
1301
1302
1303def convert_lrelu_to_lut(op, arch):
1304 ifm, ofm = op.get_ifm_ofm()
1305 # Generate the LUT
1306 alpha = op.attrs["alpha"]
1307 ifm_scale = np.double(ifm.quantization.scale_f32)
1308 ofm_scale = np.double(ofm.quantization.scale_f32)
1309 zp_in = ifm.quantization.zero_point
1310 zp_out = ofm.quantization.zero_point
1311 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1312 alpha_scalar = 1
1313 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1314 if "alpha_scaling" in op.attrs:
1315        # The LeakyRelu is the result of convert_mul_max_to_abs_or_lrelu
1316 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1317 values = []
1318 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1319 quantized_min = min(ix)
1320 quantized_max = max(ix)
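    # Build the LUT: entries below the input zero point are scaled by alpha, the remaining entries use the identity scale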
1321 for x in ix:
1322 if x < zp_in:
1323 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1324 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1325 )
1326 else:
1327 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1328 lut_result = min(quantized_max, max(quantized_min, lut_result))
1329 values.append(lut_result)
1330 return convert_to_lut(op, values, "lrelu")
1331
1332
1333def convert_lrelu(op, arch, nng):
1334 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
1335 if op.type != Op.LeakyRelu:
1336 return op
1337 ifm, ofm = op.get_ifm_ofm()
1338 if ifm is None or ofm is None:
1339 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001340 alpha = op.attrs["alpha"]
1341 if alpha == 0:
1342        # When alpha is 0 the operation can be converted to a ReLU
1343 op.type = Op.Relu
1344 op.name = op.name.replace("LeakyRelu", op.type.name)
1345 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001346 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1347 # use LUT for int8/uint8
1348 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001349 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001350 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001351 return op
1352 return convert_lrelu_to_mul_max(op, arch)
1353
1354
1355def convert_tanh_sigmoid_to_lut(op, arch, nng):
1356 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
1357 if op.type == Op.Sigmoid:
1358 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1359 elif op.type == Op.Tanh:
1360 return convert_to_lut8(op, math.tanh, "tanh")
1361 return op
1362
1363
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001364def fuse_activation_function_with_prev(op, arch, nng):
1365 # if op is a no-op: attempts to move the activation function to the preceding op
1366 if not op.attrs.get("is_nop", False) or op.activation is None:
1367 return op
1368 ifm, ofm = op.get_ifm_ofm()
1369 if ifm is None or ofm is None:
1370 return op
1371    # find the operation that produces the ifm
1372 prev_op = ifm.ops[0]
1373 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1374 fuse = (
1375 prev_op.run_on_npu
1376 and prev_op.type.npu_block_type != NpuBlockType.Default
1377 and len(ifm.ops) == 1
1378 and len(prev_op.outputs[0].consumers()) == 1
1379 and prev_op.activation is None
1380 )
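    # fuse is only true when prev_op runs on the NPU with a non-default block type, has no activation of its own,
    # and is the sole producer of the ifm with a single consumer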
1381 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1382 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1383 # LUT currently only works correctly for elementwise ops
1384 fuse = False
1385 if not fuse:
1386 return op
1387 # Move the fused activation function + corresponding info to prev_op
1388 prev_op.activation = op.activation
1389 prev_op.forced_output_quantization = op.forced_output_quantization
1390 if op.activation_lut is not None:
1391 prev_op.set_activation_lut(op.activation_lut)
1392 # Bypass op
1393 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001394 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001395 return op
1396
1397
1398def _leading_pad_ok(leading_pad, stride, kernel_size):
1399 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1400 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
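    # E.g. kernel_size=7, stride=2: max_size is 3, so a leading pad of 2 or 3 is accepted but a leading pad of 1 is rejected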
1401 max_size = kernel_size // 2
1402 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1403
1404
1405def replace_pad_by_hw_pad(op: Operation, arch, nng):
1406 """
1407 Tries to completely remove a PAD operator by using hardware padding.
1408 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1409 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1410 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1411 if both operations can be run on the NPU.
1412 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1413 """
1414 if (
1415 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001416 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001417 and op.run_on_npu
1418 and op.attrs["padding"] == Padding.VALID
1419 ):
1420 pad_op = op.ifm.ops[0]
1421 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1422 return op
1423 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1424 return op
1425 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1426 k = op.kernel
1427 k_w, k_h = k.dilated_wh()
1428
1429 # Check if the PAD operator can be replaced by hardware padding
1430 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1431 # Too much padding, it would require hardware padding to actually insert zeros
1432 return op
1433 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1434 return op
1435
1436 if op.type.is_avgpool_op():
1437 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1438 for pad, k_size in (
1439 (left, k_w),
1440 (right, k_w),
1441 (top, k_h),
1442 (bottom, k_h),
1443 ):
1444 if pad not in (0, k_size // 2):
1445 return op
1446 # Average pool is converted to depthwise, because NPU average pool + same padding
1447 # has a special implementation that is different from PAD followed by average pool with
1448 # valid padding.
1449 k_w, k_h = op.kernel.width, op.kernel.height
1450 ifm = op.ifm
1451 # Remember other inputs
1452 other_inputs = op.inputs[1:]
1453 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
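            # (the weight values are 1 and the quantization scale is 1/(k_w * k_h), so each weight effectively equals 1/(k_w * k_h))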
1454 quantization = QuantizationParameters(0.0, 255.0)
1455 quantization.scale_f32 = 1.0 / (k_w * k_h)
1456 quantization.zero_point = 0
1457 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1458 weights = np.full(shape, 1)
1459
1460 weight_tens = create_const_tensor(
1461 op.name + "_weights",
1462 shape,
1463 op.ifm.dtype,
1464 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001465 purpose=TensorPurpose.Weights,
1466 quantization=quantization,
1467 )
James Peet7519d502021-07-19 16:47:58 +01001468 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001469 op.type = Op.DepthwiseConv2DBias
1470 op.inputs = []
1471 op.add_input_tensor(ifm)
1472 op.add_input_tensor(weight_tens)
1473 # Add bias tensor, all biases set to 0
1474 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001475 fixup_bias_tensors(op, arch, nng, DataType.int32)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001476 # Add other inputs
1477 op.inputs.extend(other_inputs)
1478 op.rounding_mode = NpuRoundingMode.NATURAL
1479
1480 # Bypass the PAD operator
1481 op.set_input_tensor(pad_op.ifm, 0)
1482 # Adjust the padding attributes of the convolution operator
1483 op.attrs["padding"] = Padding.EXPLICIT
1484 op.attrs["explicit_padding"] = (top, left, bottom, right)
1485 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001486 DebugDatabase.add_optimised(op, op)
1487
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001488 return op
1489
1490
1491def convert_pad(op: Operation, arch, nng):
1492 """
1493 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1494 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1495 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1496 """
1497 if op.type != Op.Pad or not op.run_on_npu:
1498 return op
1499 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1500
1501 ifm = op.ifm
1502 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001503 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001504 ofm = op.ofm
1505 assert ofm is not None
1506 ofm.ops = []
1507 ofm_shape = op.ofm_shapes[0]
1508
1509 # Average pool op that copies IFM to the right place inside the OFM
1510 shp0 = Shape4D(0, 0, 0, 0)
1511 shp_top = shp0.with_height(top)
1512 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1513 avgpool_op.activation = op.activation
1514 quant = ofm.quantization
1515 pad_value = quant.zero_point
1516 # Add operations that fill the borders of the OFM
1517 if top > 0:
1518 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1519 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001520 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001521 )
1522 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1523 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1524 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1525 if bottom > 0:
1526 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1527 zero_tens = create_const_tensor(
1528 op.name + "_bottom",
1529 shape.as_list(),
1530 ofm.dtype,
1531 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001532 quantization=quant,
1533 )
1534 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1535 create_avg_pool_for_concat(
1536 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1537 )
1538 if left > 0:
1539 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1540 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001541 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001542 )
1543 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1544 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1545 if right > 0:
1546 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1547 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001548 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001549 )
1550 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1551 create_avg_pool_for_concat(
1552 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1553 )
1554
1555 op.type = Op.ConcatTFLite
1556 return avgpool_op
1557
1558
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001559def fixup_bias_tensors(op, arch, nng, dtype=None):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001560 if op.type.needs_bias() and op.bias is None:
1561 # Op has no bias, add bias tensor filled with zeros
1562 nr_biases = op.inputs[1].shape[-1]
1563 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001564 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1565 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1566 # For int16 the selected bias DataType will have an impact on the scaling
1567 # used when encoding the scales and biases later. The default mode will match the
1568        # reference with reduced scaling for int64 bias.
1569 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1570        # is used to emulate average pool, int32 bias should be selected for full precision
1571 # int16 scaling.
1572 if dtype is None:
1573 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1574 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001575 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1576
1577 return op
1578
1579
wilisa0146c94772023-02-08 09:56:14 +00001580def detect_asymmetric_weights(op):
1581 # Check all ops (cpu and npu)
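    # Returns True (and prints a warning) when a convolution-type op with an int8/int16 ifm has weights with a non-zero zero point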
1582 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1583 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001584 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001585 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1586 return True
1587 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001588
wilisa0146c94772023-02-08 09:56:14 +00001589
1590def fixup_asymmetric_weights(op, arch, nng):
1591 if detect_asymmetric_weights(op):
1592 if op.run_on_npu:
1593 print("Zero points have been adjusted.")
1594 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001595 return op
1596
1597
wilisa0146c94772023-02-08 09:56:14 +00001598def check_asymmetric_weights(op, arch, nng):
1599 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1600 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1601 # possibility of other graph optimiser functions modify the operator (that is later run on the CPU)
1602 if detect_asymmetric_weights(op):
1603 if op.run_on_npu:
1604 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1605 op.run_on_npu = False
1606 return op
1607
1608
1609def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
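    # Returns the graph optimiser pass to use: either force the weight zero points to zero or flag the op to run on the CPU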
1610 if force_symmetric_int_weights:
1611 return fixup_asymmetric_weights
1612 else:
1613 return check_asymmetric_weights
1614
1615
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001616def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
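    # Converts a Mean over the height/width axes to an AvgPool with truncating rounding when the input and output
    # scaling are equal, otherwise to a DepthwiseConv2DBias with all-ones weights scaled by 1/(h*w) and a bias that
    # compensates for the input zero point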
1617 if op.type == Op.Mean and op.run_on_npu:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001618 inp, axis = op.inputs
1619 shape = inp.shape
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001620 ofm_shape = op.ofm.shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001621 dims = len(shape)
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001622 dims_ofm = len(ofm_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001623
1624        # Height and width axes have different indices depending on the number of dimensions
1625 if axis.shape == [] or axis.shape[0] == 1: # single axis
1626 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1627 if dims in (2, 3):
1628 if axis == 0:
1629 h, w = shape[axis], 1
1630 else:
1631 h, w = 1, shape[axis]
1632 else:
1633 if axis == 1:
1634 h, w = shape[axis], 1
1635 else:
1636 h, w = 1, shape[axis]
1637 else: # multiple axes
1638 axis = sorted(axis.values)
1639 h, w = [shape[i] for i in axis]
1640
1641 # Set necessary depthwise attributes
1642 op.attrs.update(
1643 {
1644 "padding": Padding.VALID,
1645 "stride_h": 1,
1646 "stride_w": 1,
1647 "strides": (1, 1, 1, 1),
1648 "depth_multiplier": 1,
1649 "channel_multiplier": 1,
1650 "dilation_h_factor": 1,
1651 "dilation_w_factor": 1,
1652 "dilation": (1, 1, 1, 1),
1653 }
1654 )
1655 # Change op type
1656 op.type = Op.DepthwiseConv2DBias
1657 # Set IFM/OFM shapes after changing op type
1658 op.set_ifm_ofm_shapes()
1659
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001660 weight_scale, bias = 1, 0
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001661 ofmq, ifmq = op.ofm.quantization, inp.quantization
Johan Alfvén9d51ec42022-10-27 16:30:01 +02001662 if ifmq.is_scaling_equal(ofmq):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001663 # Here we can just use a simple AvgPool with truncating rounding,
1664 # as we're emulating simple integer division.
1665 op.rounding_mode = NpuRoundingMode.TRUNCATE
1666 op.type = Op.AvgPool
1667 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1668 else:
1669 op.rounding_mode = NpuRoundingMode.NATURAL
1670 weight_scale = 1 / (h * w)
1671 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1672 bias = -ifmq.zero_point * h * w
1673 fiq = ifmq.clone()
1674 fiq.zero_point = 0
1675 op.forced_input_quantization = fiq
1676
1677 # Change dimensions to 4
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001678 def extend_dims(dim, in_shape):
1679 if dim < 4:
1680 in_shape = [1] + in_shape
1681 if dim == 2:
1682 in_shape += [1]
1683 return in_shape
1684
1685 if dims < 4 or dims_ofm < 4:
1686 # Fix the ofm dimension when keep_dims is false
1687 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
1688 if isinstance(axis, int) and dims_ofm + 1 == dims:
1689 ofm_shape.insert(axis, 1)
1690 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
1691 for i in axis:
1692 ofm_shape.insert(i, 1)
1693 shape = extend_dims(dims, shape)
1694 dims_ofm = len(ofm_shape)
1695 ofm_shape = extend_dims(dims_ofm, ofm_shape)
1696 op.set_ifm_ofm_shapes()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001697
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001698 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001699 weight_shape = None
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001700 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001701            # This can only happen, and can only be done, for reductions over multiple axes, and
1702 # h * w <= 256 for DepthwiseConv2DBias
1703 # h * w <= 4096 for AvgPool
1704 # which is checked in supported ops
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001705 shape = [shape[0], 1, h * w, shape[3]]
1706 op.ifm_shapes[0] = Shape4D(shape)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001707 weight_shape = [1, h * w, shape[3], shape[0]]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001708 if h > 256 and op.type == Op.AvgPool:
1709 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1710
1711 # If the AvgPool version is used, we don't need to do anything else
1712 if op.type == Op.AvgPool:
wilisa0179a89042022-11-02 17:18:43 +00001713 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001714 return op
1715
1716 # Make unit weight tensor quantization
1717 weight_quant = ifmq.clone()
1718 weight_quant.min = 0
1719 weight_quant.max = 255
1720 weight_quant.scale_f32 = weight_scale
1721 weight_quant.zero_point = 0
1722
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001723 if weight_shape is None:
1724 # Set weight shape to [H,W,C,B]
1725 weight_shape = [h, w, shape[3], shape[0]]
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001726
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001727 # Add unit weight tensor
1728 op.set_input_tensor(
1729 create_const_tensor(
1730 "weights",
1731 weight_shape,
1732 inp.dtype,
1733 np.ones(weight_shape),
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001734 quantization=weight_quant,
1735 ),
1736 1,
1737 )
James Peet7519d502021-07-19 16:47:58 +01001738 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001739
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001740 # Add bias tensor
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001741 bias_shape = [shape[-1]]
1742 op.inputs.append(create_const_tensor("bias", bias_shape, DataType.int32, np.ones(bias_shape) * bias))
wilisa0179a89042022-11-02 17:18:43 +00001743 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001744
1745 return op
1746
1747
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001748def optimise_quantize(op: Operation, arch, nng):
1749
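    # Statically evaluates Quantize operators with constant input values and converts them to Const operators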
1750 if op.type == Op.Quantize and op.run_on_npu:
1751
1752 ifm, ofm = op.get_ifm_ofm()
1753 input_values = ifm.values
1754
1755 # Guard clause - input not const or no values to quantize
1756 if ifm.ops[0].type != Op.Const or input_values is None:
1757 return op
1758
1759 # Singular val in numpy array, convert to indexable array
1760 if input_values.ndim == 0:
1761 input_values = np.array([input_values])
1762
Fredrik Svedberg11563172022-07-06 14:54:12 +02001763 # requantized int8 to int8 or int16 to int16
1764 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001765
1766 # scale needs to use double precision to match TFLite reference kernel
1767 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
1768 effective_multiplier, effective_shift = quantise_scale(effective_scale)
1769
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001770 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001771 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001772 input_val = val - ifm.quantization.zero_point
1773
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001774 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
1775 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001776
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001777 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
1778 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001779
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001780 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
1781 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001782
1783 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001784 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001785
1786 quantized_vals = []
1787 for val in input_values:
1788
1789 # Derive quantized value
1790 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001791 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
1792 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001793
1794 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001795 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
1796
1797 # Unsupported data type
1798 else:
1799 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001800
1801 # Make quantize op const and disconnect from parent node
1802
1803 # Remove reference of the current quant op from the parent tensor's consumer list
1804 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1805
1806 # Clear any references to parent node
1807 op.inputs = []
1808
1809 # Convert this quantize op to const
1810 op.type = Op.Const
1811
1812 return op
1813
1814
Ayaan Masood4965fae2022-06-29 11:30:57 +01001815def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
1816 """Static optimisation for SHAPE operator output value known at compile time"""
1817
1818 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
1819
1820 if op.type == Op.Shape and op.run_on_npu:
1821
1822 ifm, ofm = op.get_ifm_ofm()
1823
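        # Only fold the SHAPE op when its output holds exactly one element per input dimension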
1824 if len(ifm.shape) != ofm.shape[0]:
1825 return op
1826
1827 # Remove reference of the current shape op from the parent tensor's consumer list
1828 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1829
1830 # Clear any references to parent node
1831 op.inputs = []
1832
1833 # Convert this SHAPE op to const
1834 op.type = Op.Const
1835
1836        # Set the output values to the statically known shape of the input tensor
1837 ofm.values = np.array(ifm.shape)
1838
1839 return op
1840
1841
Tim Hallea4ba662022-11-11 18:19:53 +00001842def fixup_dilation_gt2(op, arch, nng):
1843 assert op.run_on_npu
1844 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
1845 dilation_w, dilation_h = op.get_kernel_dilation()
1846
1847 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
1848 # kernel
1849 if dilation_w > 2 or dilation_h > 2:
1850 kernel_w, kernel_h = op.get_kernel_size()
1851 kernel_ic = op.weights.shape[-2]
1852 kernel_oc = op.weights.shape[-1]
1853
1854            # if the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
1855            # of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
1856 # odd = 1, even = 2
1857 hw_dilation_h = 1 if (dilation_h & 1) else 2
1858 hw_dilation_w = 1 if (dilation_w & 1) else 2
1859
1860 scale_dilation_h = dilation_h // hw_dilation_h
1861 scale_dilation_w = dilation_w // hw_dilation_w
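            # e.g. dilation 4 becomes hw dilation 2 with scale dilation 2; dilation 3 becomes hw dilation 1 with scale dilation 3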
1862
1863 # create new empty kernel (HWIO format)
1864 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
1865 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
1866
1867 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
1868 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
1869
1870 # copy the original kernel values into the new sparse kernel
1871 for h in range(0, kernel_h):
1872 for w in range(0, kernel_w):
1873 new_h = h * scale_dilation_h
1874 new_w = w * scale_dilation_w
1875 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
1876
1877 # update the weight tensor with the new dilated kernel
1878 op.weights.shape = new_kernel_shape
1879 op.weights.values = new_kernel_values
1880
1881 # enable(=2) / disable(=1) hardware dilation
1882 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
1883 op.attrs["dilation_h_factor"] = hw_dilation_h
1884 op.attrs["dilation_w_factor"] = hw_dilation_w
1885
1886 return op
1887
1888
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001889def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02001890 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001891 return op
1892
1893
wilisa0146c94772023-02-08 09:56:14 +00001894def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
Fredrik Svedberg11563172022-07-06 14:54:12 +02001895 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00001896 optimisation_list = [
1897 optimise_quantize,
1898 convert_shape_op_to_constant_tensor,
1899 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
1900 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001901
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001902 for idx, sg in enumerate(nng.subgraphs):
1903 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001904 nng,
1905 sg,
1906 arch,
1907 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01001908 optimisation_list,
1909 rewrite_unsupported=False,
1910 )
1911
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001912 # Pre-processing step
wilisa0146c94772023-02-08 09:56:14 +00001913 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001914
Ayaan Masood4965fae2022-06-29 11:30:57 +01001915 for idx, sg in enumerate(nng.subgraphs):
1916 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1917 nng,
1918 sg,
1919 arch,
1920 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02001921 pre_process_list,
1922 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001923 )
1924
1925 # Handle Concat Ops
1926 for idx, sg in enumerate(nng.subgraphs):
1927 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
1928 sg.refresh_after_modification()
1929
1930 # Handle Split Ops
1931 for idx, sg in enumerate(nng.subgraphs):
1932 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
1933 nng,
1934 sg,
1935 arch,
1936 [],
1937 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
1938 rewrite_unsupported=False,
1939 )
1940
1941 for idx, sg in enumerate(nng.subgraphs):
1942 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001943 nng,
1944 sg,
1945 arch,
1946 [rewrite_split_ops],
1947 [],
1948 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001949 )
1950
Johan Alfvena5e1b622023-02-02 14:59:03 +01001951 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001952 for idx, sg in enumerate(nng.subgraphs):
1953 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001954 nng,
1955 sg,
1956 arch,
1957 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01001958 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02001959 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001960 )
1961
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001962 # Rewrite of operators
1963 op_rewrite_list = [
1964 set_tensor_equivalence,
1965 convert_mean_to_depthwise_conv_or_avgpool,
1966 convert_depthwise_to_conv,
1967 convert_conv_to_fc,
1968 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001969 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02001970 convert_mul_max_to_abs_or_lrelu,
1971 convert_lrelu,
Raul Farkas090f18a2023-01-24 16:29:06 +00001972 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001973 convert_hardswish_to_lut,
1974 rewrite_fully_connected_input,
1975 convert_batched_fc_shape,
1976 fixup_conv2d_backprop,
1977 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001978 reorder_depthwise_weights,
Tim Hall885033b2022-07-21 11:46:03 +01001979 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001980 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001981 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001982 convert_tanh_sigmoid_to_lut,
1983 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00001984 fixup_dilation_gt2,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001985 ]
1986
1987 for idx, sg in enumerate(nng.subgraphs):
1988 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001989 nng,
1990 sg,
1991 arch,
1992 [],
1993 op_rewrite_list,
1994 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001995 )
1996
1997 for idx, sg in enumerate(nng.subgraphs):
1998 # remove passthrough tensors and attempt further optimizations
1999 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2000 nng,
2001 sg,
2002 arch,
2003 [remove_passthrough_tensor],
2004 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2005 )
2006
2007    # Removal of SplitSliceRead needs to be done after the optimisations have been performed,
2008    # since the ifm/ofm_shapes are of importance to this function
2009 for sg in nng.subgraphs:
2010 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2011 sg.refresh_after_modification()
2012
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002013 # Make sure that const optimisations on subgraph outputs are handled correctly
2014 for sg in nng.subgraphs:
2015 for ofm in sg.output_tensors:
2016 if ofm.is_const and ofm.ops[0].type_changed:
2017 # Subgraph output cannot be const - insert a memory copy
2018 op = ofm.ops[0]
2019 ofm_clone = ofm.clone()
2020 ofm_clone.values = ofm.values
2021 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002022 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002023 memcpy = create_add_nop(f"{ofm.name}_copy")
2024 memcpy.add_input_tensor(ofm_clone)
2025 memcpy.add_input_tensor(zero)
2026 memcpy.set_output_tensor(ofm)
2027 memcpy.set_ifm_ofm_shapes()
2028 op.set_output_tensor(ofm_clone)
2029 DebugDatabase.add_optimised(op, memcpy)
2030
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002031 return nng