# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .api import NpuRoundingMode
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import convert_to_lut
from .graph_optimiser_util import create_avg_pool_for_concat
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .lstm import Lstm
from .numeric_util import clamp_sigmoid
from .numeric_util import full_shape
from .numeric_util import round_away_zero
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import create_cast_op
from .operation_util import create_depthwise_maxpool
from .operation_util import create_memcpy
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype

passthrough_nodes = (Op.Identity,)


def remove_passthrough_tensor(tens, arch, nng):
    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
        assert len(tens.ops[0].inputs) == 1
        tens = tens.ops[0].inputs[0]
    return tens


def rewrite_concat_ops(op, arch):
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]

        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert ofm.shape[axis] == offset

    return op
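
# For illustration (not part of the original conversion): concatenating two NHWC
# inputs of shape [1, 8, 8, 16] and [1, 8, 8, 32] along the depth axis gives write
# offsets [0, 0, 0, 0] and [0, 0, 0, 16] for the generated AvgPool copies, and the
# final offset (48) must equal the OFM depth, which is what the assert above checks.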


def rewrite_split_ops(tens, arch, nng):

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0
        if None in (offset_end, offset_start):
            read_shape = None
        else:
            # the read shape is relative to each start offset
            read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    read_shape = split_op.ofm_shapes[idx]
                    break

                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.read_shapes[0] = read_shape
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens


def remove_SplitSliceRead(op, arch):

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
        # or if an avgpool needs to be inserted
        if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
            consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
            for consumer in op.ofm.consumer_list
        ):
            # SplitSliceRead can be performed by tensor consumer(s)
            for cons_op in list(op.ofm.consumer_list):
                move_splitsliceread_to_consumer(op, cons_op)
        else:
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]
            avgpool_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)


def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt
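
# Worked example (illustrative only): for SAME padding with a dilated 3x3 kernel,
# stride 1 and a 10x10 IFM, needed_total_padding() returns 2 in each dimension, so
# padding = (1, 1, 1, 1) and skirt = (1, 1, 2 - 1, 2 - 1) = (1, 1, 1, 1), i.e. how
# far the kernel may read outside the IFM on each side.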


def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt


def fixup_conv2d_backprop(op, arch, nng):
    if op.type == Op.Conv2DBackpropInput:
        # flip the inputs
        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
        op.type = Op.Conv2DBackpropInputSwitchedBias
        op.ifm_resampling_mode = resampling_mode.TRANSPOSE

        # Update strides
        op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
        DebugDatabase.add_optimised(op, op)

    return op


# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    op.type = Op.Add  # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.name = op.name + "_add"
    # Create an input tensor filled with zeros
    name = op.inputs[1].name + "_add"
    dtype = op.inputs[0].dtype
    shape = op.ofm_shapes[0].as_list()
    values = np.zeros(shape, dtype.as_numpy_type())
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = 1.0
    quantization.zero_point = 0
    op.inputs[1] = op.inputs[0]
    op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op
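
# Rationale (informal): when the IFM is 1x1xC every output element of the resize is
# a broadcast copy of that single pixel, so the op can be expressed as an elementwise
# Add between the broadcast IFM and a zero constant of the OFM shape.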


# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
# to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]
    dw_op_attrs = {
        "padding": Padding.VALID,
        "stride_h": 1,
        "stride_w": 1,
        "strides": (1, 1, 1, 1),
        "depth_multiplier": 1,
        "channel_multiplier": 1,
        "dilation_h_factor": 1,
        "dilation_w_factor": 1,
        "dilation": (1, 1, 1, 1),
    }

    # change resizebilinear to depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(dw_op_attrs)
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0
    ofm_dtype = ofm.dtype
    if ofm_dtype.type == BaseType.UnsignedInt:
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
        weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    weight_values = [0] * (upscale_factor * upscale_factor)
    centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
    weight_values[centre_coeff] = 1

    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(
        create_const_tensor(
            "weights",
            weight_shape,
            ofm_dtype,
            np.array(weight_values).reshape(weight_shape),
            quantization=weight_quant,
        ),
        1,  # inputs tensor weight index
    )

    # setup bias tensor by assigning None and then call the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None, DataType.int32)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op
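
# For illustration: with upscale_factor = 2 the weight kernel is 2x2 and
# centre_coeff = (2 // 2) * 2 + (2 // 2) = 3, i.e. the single '1' coefficient lands
# at position D (below-and-right of the centre) in the diagram above.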


# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize op's upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    pre_op = op
    outputs = op.outputs
    dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    n = int(np.log2(upscale_factor))

    # Perform x2 upscaling n-1 times
    scaled_op = pre_op
    for count in range(n - 1):
        if count > 0:
            scaled_op = op.clone(f"_{count}")
            scaled_op.inputs[0] = pre_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_shape = upscaled_shape * 2
        shape = op.ofm_shapes[0].as_list()
        shape[1:3] = upscaled_shape
        out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
        out_tens.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    # Last x2 upscaling
    if n > 1:
        scaled_op = op.clone(f"_{n-1}")
        scaled_op.inputs[0] = pre_op.outputs[0]

    if scaled_op.original_type == Op.ResizeBilinear:
        if scaled_op.attrs["align_corners"]:
            # no padding
            scaled_op.attrs["padding"] = Padding.VALID
        else:
            # padding to the right and bottom (limits average pool to 8x8 kernel)
            scaled_op.attrs["padding"] = Padding.EXPLICIT
            scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

        # kernel size dependent on the upscaling factor
        scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
    else:  # Op.ResizeNearestNeighbor
        if scaled_op.attrs["align_corners"]:
            # use depthwise conv to select the correct value
            scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
        else:
            # Keep 1x1 kernel and average pool, this applies both when
            # half-pixel-centers is True and False. Calculations are the
            # same in the reference.
            pass

    scaled_op.outputs = outputs
    scaled_op.outputs[0].ops = [scaled_op]
    scaled_op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, scaled_op)

    return op
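
# For illustration: an x8 resize (n = 3) is carried out as three x2 nearest-neighbour
# upscaling stages; the first two use 1x1 AvgPool kernels and the final stage uses a
# kernel that depends on the original op type, e.g. an 8x8 AvgPool for ResizeBilinear
# without align_corners.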


def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
    """
    Convert ArgMax to DWConv2D->MaxPool->DWConv2D, see details below.

    Example:
    arr = [4,    [00000100,
           6, =   00000110,  # <-- This is the largest value, so we're expecting argmax(arr) = 1
           5]     00000101]

    Use 16-bit precision and shift all values 7 bits to the left:
    Shifted_arr = [0000001000000000,
                   0000001100000000,
                   0000001010000000]

    Add "c - index of channel" to each channel:
    Shifted_arr_plus_reverse_idx = [0000001000000010, (+2)
                                    0000001100000001, (+1)
                                    0000001010000000] (+0)

    The index is reversed since ArgMax selects the lowest index if the maximum value is found at two indices. The
    index will act as a tie-breaker between channels with equal values and since we want the smallest channel index
    to be chosen we reverse the index before the maxpool and then subtract the index from the number of channels
    after the maxpool to get the correct index.

    Find the maximum value in the array:
    val = max(shifted_arr_plus_reverse_idx) = 0000001100000001

    Subtract the value from the number of channels:
    shifted_arr_plus_idx = (c-1) - val = 2 - 1 = 1

    Extract the 7 lowest bits using a LUT to cut off the 9 most significant bits:
    idx = LUT(val) = 0000000000000001 = 1
    """

    if op.type == Op.ArgMax:
        ifm, ofm = op.inputs[0], op.outputs[0]
        identity_quant = QuantizationParameters()
        identity_quant.zero_point = 0
        identity_quant.scale_f32 = 1.0
        # Add last dimension to ofm shape
        ofm.shape += [1]
        ofm.ops = []

        # Create 1x1 Depthwise convolution with 2**7 weights for each channel to convert precision to 16 bit and shift
        # all values 7 bits to the left
        # Set necessary depthwise attributes
        dw_op_attrs = {
            "padding": Padding.VALID,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
            "explicit_padding": None,
        }
        orig_name = op.name
        op.name = f"{orig_name}_depthwise_conv_SHL_7"
        op.type = Op.DepthwiseConv2DBias
        op.attrs.update(dw_op_attrs)
        n, h, w, c = full_shape(4, ifm.shape, 1)
        shape = [1, 1, 1, c]
        kernel = np.dstack([2**7] * c)
        op.inputs = []
        op.add_input_tensor(ifm)
        op.add_input_tensor(
            create_const_tensor(
                "weights",
                shape,
                DataType.uint8,
                np.array(kernel).reshape(shape),
                quantization=identity_quant,
            ),
        )
        # Let the bias for each channel be the "reverse" index of the channel it is in, i.e. c - channel_idx
        reverse_idxs = list(reversed(range(c)))
        bias_tensor = create_const_tensor(op.name + "_bias", [c], DataType.int64, reverse_idxs)
        op.add_input_tensor(bias_tensor)

        intermediate_tens = Tensor([n, h, w, c], DataType.int16, "int16_and_shifted_7_bits_left")
        intermediate_tens.quantization = ifm.quantization
        op.set_output_tensor(intermediate_tens)
        op.set_ifm_ofm_shapes()
        orig_ifm_shape = op.ifm_shapes[0]
        DebugDatabase.add_optimised(op, op)

        # To extract 7 least significant bits and swap reverse index back to real index using a LUT activation, we set
        # the base value to c-1 and slope to -128. The 16-bit LUT uses a table of 32-bit values where the top 16 bits
        # represent the slope and bottom 16 bits the base which are used to interpolate the activation value.
        slope = (-128 & 0xFFFF) << 16  # Top 16 bits of 32 bit LUT table value
        base = c - 1  # Bottom 16 bits of the LUT table value
        lut_tensor = create_const_tensor(
            "maxpool_LUT_extract_7_LSB",
            [1, 1, 1, 512],
            DataType.uint32,
            [slope + base] * 512,
            TensorPurpose.LUT,
        )

        # Split large feature maps into smaller chunks since the Depthwise Maxpool height dimension can overflow due to
        # flattening the ifm to (H*W)xCx1
        max_height = 2**16 // orig_ifm_shape.width
        num_full_height_ops = orig_ifm_shape.height // max_height
        last_op_height = orig_ifm_shape.height - max_height * num_full_height_ops
        op_heights = [max_height] * num_full_height_ops
        if last_op_height > 0:
            op_heights.append(last_op_height)

        # Create maxpool output tensor which is reshaped to 1x(H*W)x1x1. The product H*W might be larger than the
        # maximum allowed height, but that's handled by reading and writing the data in chunks
        maxpool_ofm = Tensor([1, orig_ifm_shape.height * orig_ifm_shape.width, 1, 1], DataType.int16, "argmax_maxpool")
        maxpool_ofm.quantization = identity_quant

        for op_idx, op_height in enumerate(op_heights):
            maxpool_op = create_depthwise_maxpool(
                f"dw_maxpool_{op_idx}", intermediate_tens, orig_ifm_shape, identity_quant
            )
            maxpool_op.outputs = [maxpool_ofm]
            maxpool_ofm.ops.append(maxpool_op)
            maxpool_op.ofm_shapes = [Shape4D(maxpool_ofm.shape)]
            maxpool_op.set_activation_lut(lut_tensor)

            # Set read and write shapes/offsets to read/write chunks of the IFM/OFM
            maxpool_op.read_shapes[0] = Shape4D([1, op_height * orig_ifm_shape.width, orig_ifm_shape.depth, 1])
            maxpool_op.read_offsets[0] = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            maxpool_op.write_shape = Shape4D([1, op_height * orig_ifm_shape.width, 1, 1])
            maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            DebugDatabase.add_optimised(op, maxpool_op)

        # Set final shape
        maxpool_ofm.set_all_shapes([1, h, w, 1])

        # Convert 16bit to 32bit or 64bit
        if ofm.dtype == DataType.int64:
            # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
            #
            # A      -> B          -> C           -> D (OFM)
            # |0001|    |00010000|    |0001|0000|    |00010000|00000000|
            #  i16       i32           i16  i16       i32      i32
            #                                         <-------i64------>
            #
            # Memcpy is used to copy the content from B to C and from D to OFM
            # Memcpy will be turned into a nop or a DMA transfer if the memory regions differ.
            intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
        else:
            intermediate_32bit = ofm

        op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
        DebugDatabase.add_optimised(op, op_cast)

        if ofm.dtype == DataType.int64:
            # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
            intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
            memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
            DebugDatabase.add_optimised(op, memcpy_op)

            # Create int32 tensor with double ofm shape to be able to store a "int64" result
            intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")

            op_cast = create_cast_op(
                f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
            )
            DebugDatabase.add_optimised(op, op_cast)

            memcpy_op = create_memcpy(f"{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
            DebugDatabase.add_optimised(op, memcpy_op)

    return op


def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
    def _compute_interpolation_values(index, input_size, output_size):
        scale = input_size / output_size
        scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
        lower_bound = max(np.floor(scaled_value), 0)

        return scaled_value, lower_bound

    def _compute_kernels(input_height, input_width, output_height, output_width):
        kernels = []
        for y in (1, 2):
            for x in (1, 2):
                sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
                sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)

                # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
                # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
                # top-to-bottom - same as the depthwise convolution strides across each tile
                kernel = np.zeros((2, 2))
                kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
                kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
                kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
                kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
                kernel *= 16
                kernels.append(kernel)

        return kernels

    def _build_convolutions(op, kernels):
        dw_op_attrs = {
            "padding": Padding.TILE,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
        ifm = op.ifm
        ofm = op.ofm
        ofm.ops = []
        elem_size = 2 if ofm.dtype == DataType.int16 else 1

        n, h, w, c = ifm.shape
        _, _, ow, _ = ofm.shape

        intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
        intermediate_tens.quantization = op.outputs[0].quantization.clone()
        avgpool_op = op
        avgpool_op.name = "rb_init_avgpool"
        avgpool_op.type = Op.AvgPool
        avgpool_op.attrs["padding"] = Padding.VALID
        avgpool_op.attrs["stride_w"] = 1
        avgpool_op.attrs["stride_h"] = 1
        avgpool_op.attrs["filter_width"] = 1
        avgpool_op.attrs["filter_height"] = 1
        avgpool_op.attrs["strides"] = [1, 1, 1, 1]
        avgpool_op.attrs["ksize"] = [1, 1, 1, 1]

        avgpool_op.add_input_tensor(ifm)
        avgpool_op.set_output_tensor(intermediate_tens)
        avgpool_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, op)

        dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
        dw_conv._original_type = Op.ResizeBilinear
        dw_conv.write_shape = Shape4D(n, h, w, c)
        dw_conv.write_offset = Shape4D(0, 0, 0, 0)

        # Set the output rounding mode. Resize bilinear requires rounding away from zero. Therefore, we need to
        # adjust the accumulated value by a "small" amount before applying natural rounding. The "small" amount
        # should be big enough to cause a x.5 to be rounded correctly but small enough not to cause smaller
        # values to be incorrectly rounded
        ofm.quantization.next_after = True
        dw_conv.rounding_mode = NpuRoundingMode.NATURAL

        # Double height and width stride to write the output of each of the four depthwise convolutions below
        # interleaved with each other when combined with OFM tile base offsets.
        dw_conv.ofm_stride_multiplier = [1, 2, 2]  # C/H/W

        # Choose tile padding direction - pad by 1 with edge values in two directions.
        # For example, TL (top left) will pad top and left in H/W-plane in all channels.
        directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]]  # TL, TR, BL, BR
        for i in (0, 1):
            for j in (0, 1):
                index = i * 2 + j
                dw_conv.name = f"depthwise_conv_{index}"
                dw_op_attrs["explicit_padding"] = directions[index]
                dw_conv.attrs.update(dw_op_attrs)

                # This will offset the start of the write by modifying the Tile 0 base address
                dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size

                ofm.ops.append(dw_conv)
                dw_conv.outputs = [ofm]

                kernel = kernels[index]
                shape = [2, 2, 1, c]
                kernel = np.dstack([kernel] * c)

                quant = QuantizationParameters()
                quant.zero_point = 0
                quant.scale_f32 = 1.0 / 16

                dw_conv.inputs = []
                dw_conv.add_input_tensor(intermediate_tens)
                dw_conv.add_input_tensor(
                    create_const_tensor(
                        "weights",
                        shape,
                        intermediate_tens.dtype,
                        np.array(kernel).reshape(shape),
                        quantization=quant,
                    ),
                )

                # setup bias tensor by assigning None and then call the fix-up function to create a suitable tensor.
                # need to append the bias tensor as resize ops only have 2 inputs
                assert len(dw_conv.inputs) == 2
                dw_conv.inputs.append(None)
                fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)

                dw_conv.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, dw_conv)

                dw_conv = dw_conv.clone(f"_{index}")
        return op

    _, input_height, input_width, _ = op.ifm.shape
    _, output_height, output_width, _ = op.ofm.shape

    kernels = _compute_kernels(input_height, input_width, output_height, output_width)
    op = _build_convolutions(op, kernels)

    return op
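
# For illustration: for an x2 upscale with half_pixel_centers=True the fractional
# offsets are 0.25 and 0.75, so each 2x2 kernel holds the weights {9, 3, 3, 1} (in
# some arrangement); the x16 factor applied above is compensated by the weight scale
# of 1/16.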


def fixup_resize(op, arch, nng):
    if op.type.is_resize_op() and op.run_on_npu:
        if op.ifm_shapes[0] == op.ofm_shapes[0]:
            # Bypass the resize op which is essentially a NOP
            op.inputs = op.inputs[:1]
            op.type = Op.Identity
        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
            convert_resize_1x1_to_add(op)
        elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
            convert_resizebilinear_to_depthwise_convolutions(op)
        else:
            convert_resize_to_upscale_and_average_pool(op)

    return op


def convert_nop_split_to_identity(op, arch, nng):
    if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it shouldn't, remove_passthrough_tensor will fail appropriately
        op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
        op.type = Op.Identity
    return op


def rewrite_fully_connected_input(op: Operation, arch, nng):
    # If the operation already has a read shape do not modify
    # the ifm shape, since that will already be correct
    if op.type == Op.FullyConnected and not op.read_shapes[0]:
        new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
        assert new_shape is not None, "Tensor can not be reshaped to 2D"
        op.ifm_shapes[0] = new_shape

        if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
            # If IFM is batching then also make sure OFM is batching
            h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
            op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])

    return op


def convert_batched_fc_shape(op, arch, nng):
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op
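
# For illustration: a FullyConnected IFM with batch 8 is re-expressed as the 4D shape
# [1, 2, 4, depth] using batching_split, so the batch dimension maps onto the H/W
# plane the NPU operates on; batch sizes not in the table fall back to [1, 1, n, depth].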


def unfuse_activation_function(op):
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, act_op)


def rewrite_stridedslice_output(op, arch, nng):
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op
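
# For illustration: prev_mask - (mask & (mask - 1)) isolates the lowest set bit, so
# shrink_axis_mask = 0b0100 resolves to axis 2; the loop above then re-inserts a 1 at
# that axis in the output shape so the op can be handled with an explicit 4D axis
# (split_axis_4D).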


def rewrite_unpack_output(op, arch, nng):
    tens = op.outputs[0]
    if op.run_on_npu and op.type == Op.Unpack:
        # Unpack is also referred to as Unstack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis
        op.type = Op.UnpackReshaped
        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

        axis_4D = axis + (4 - len(desired_output_shape))
        op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

        for idx, out_tens in enumerate(op.outputs):
            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op


def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"],
                    op.kernel,
                    input_shape,
                    op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


def reorder_depthwise_weights(op, arch, nng):
    if op.type.is_depthwise_conv2d_op():
        weight_tensor = op.inputs[1]
        weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
        weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
        weight_tensor.weight_transpose_depthwise = True

    return op


def fixup_strided_conv(op: Operation, arch, nng):
    """Optimize or fixup strided Conv2DBias
    Optimization:
        Reduce, when possible, the Conv2DBias stride from 2 to 1 by re-shaping
        both IFM and filter.

    Fixup:
        Introduce software support for Conv2DBias with stride_width = 4 by
        reducing it to 1 when possible by re-shaping both IFM and filter.
    """
    if op.type != Op.Conv2DBias:
        return op
    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]
    if (
        (stride_x == 2 or stride_x == 4)
        and ifm_shape.depth <= 4
        and ifm_shape.width % 2 == 0
        and weight_tensor is not None
        and weight_tensor.shape[1] >= 2
    ):
        k_w, _ = op.get_kernel_size()
        curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
        optimised_padding_x = needed_total_padding(ifm_shape.width // stride_x, 1, (k_w + 1) // stride_x)
        padding_type = op.attrs.get("padding", None)

        # If padding is enabled, check if current padding matches optimised padding
        if not padding_type or (padding_type != Padding.VALID and curr_padding_x != optimised_padding_x):
            # Horizontal padding would become different after optimisation; this would not work
            return op
        # IFM
        op.ifm_shapes[0] = Shape4D(
            [ifm_shape.batch, ifm_shape.height, ifm_shape.width // stride_x, ifm_shape.depth * stride_x]
        )

        # Weights
        weight_shape = weight_tensor.shape
        if weight_shape[1] % 2 != 0:
            weight_shape[1] = weight_shape[1] + 1
            padded_array = np.zeros(weight_shape)
            for i in range(weight_shape[0]):
                padded_array[i] = np.vstack(
                    [
                        weight_tensor.values[i],
                        np.full((1, weight_shape[2], weight_shape[3]), weight_tensor.quantization.zero_point),
                    ]
                )
            weight_tensor.values = padded_array

        # Change weight shape based on stride_x
        weight_shape[1] //= stride_x
        weight_shape[2] *= stride_x

        weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides
        stride_x = 1
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

        op.ifm.force_linear_format = True
    return op
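
# For illustration: a stride-2 Conv2D with IFM [1, 10, 16, 3] is rewritten to use an
# IFM view of [1, 10, 8, 6] (width halved, depth doubled) and a weight tensor whose
# kernel width is halved and input-channel count doubled, after which the horizontal
# stride can be set to 1.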


def convert_conv_to_fc(op, arch, nng):
    # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
    if op.type == Op.Conv2DBias:
        h = op.ifm_shapes[0].height
        w = op.ifm_shapes[0].width
        kh, kw, _, _ = op.inputs[1].shape
        if h == 1 and w == 1 and kh == 1 and kw == 1:
            # Overwrite this op as a Fully Connected Op
            op.name += "_fc"
            op.type = Op.FullyConnected
            op.attrs = {
                "weights_format": 0,
            }
            # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
            weight_tensor = op.inputs[1]
            weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
    return op


def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Add explicit rescaling
            rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
            multiplier, shift = scaling.quantise_scale(rescale)
            relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            op = relu_fused_op
    return op


def convert_lstm(op, arch, nng):
    if op.type == Op.UnidirectionalSequenceLstm:
        lstm = Lstm(op)
        op = lstm.get_graph()
    return op


def convert_softmax(op, arch, nng):
    if op.type == Op.Softmax and op.run_on_npu:
        softmax = SoftMax(op)
        op = softmax.get_graph()
    return op


def convert_prelu(op, arch, nng):
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If const alpha check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu
            alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
                # Multiply with alpha tensor
                mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
                mul_alpha.add_input_tensor(ifm)
                mul_alpha.add_input_tensor(alpha)
                fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
                mul_alpha.set_output_tensor(fm_alpha)
                mul_alpha.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, mul_alpha)
                if check_quantized_tens_scaling_equal(ifm, ofm):
                    # No scaling is needed
                    fm_id = ifm
                else:
                    # Add multiplication with identity
                    mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
                    mul_identity.add_input_tensor(ifm)
                    # Create const tensor containing identity as scalar
                    quantization = ifm.quantization.clone()
                    quantization.scale_f32 = np.float32(1)
                    quantization.zero_point = 0
                    one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
                    mul_identity.add_input_tensor(one)
                    # Make sure that fm_id is allocated to a different address than fm_alpha
                    fm_id = ofm.clone(op.name + "_id", set_unique=True)
                    mul_identity.set_output_tensor(fm_id)
                    mul_identity.set_ifm_ofm_shapes()
                    DebugDatabase.add_optimised(op, mul_identity)

                # Combine scaled and alpha multiplied values
                max_op = Operation(Op.Maximum, op.name + "_max")
                max_op.add_input_tensor(fm_alpha)
                max_op.add_input_tensor(fm_id)
                max_op.set_output_tensor(ofm)
                max_op.set_ifm_ofm_shapes()

                DebugDatabase.add_optimised(op, max_op)
                ifm.consumer_list.remove(op)
                return max_op

        # Catch all PReLU conversion for the cases that could not be optimised above
        no_scale_quant = ifm.quantization.clone()
        no_scale_quant.scale_f32 = None
        no_scale_quant.zero_point = 0
        zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)

        # Select values < 0
        min_op = Operation(Op.Minimum, op.name + "_min")
        min_op.add_input_tensor(ifm)
        min_op.add_input_tensor(zero)
        fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
        min_op.set_output_tensor(fm_negative)
        min_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, min_op)

        # and multiply with alpha tensor
        mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
        mul_alpha.add_input_tensor(fm_negative)
        mul_alpha.add_input_tensor(alpha)
        fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
        mul_alpha.set_output_tensor(fm_alpha)
        mul_alpha.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, mul_alpha)

        # Select (and scale) values > 0
        relu_op = Operation(Op.Relu, op.name + "_relu")
        relu_op.add_input_tensor(ifm)
        fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
        relu_op.set_output_tensor(fm_scaled)
        relu_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, relu_op)

        # Add scaled and alpha multiplied values (without scaling)
        add_op = Operation(Op.Add, op.name + "_add")
        add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])  # No scaling
        add_op.add_input_tensor(fm_alpha)
        add_op.add_input_tensor(fm_scaled)
        add_op.set_output_tensor(ofm)
        add_op.set_ifm_ofm_shapes()

        DebugDatabase.add_optimised(op, add_op)
        ifm.consumer_list.remove(op)
        op = add_op

    return op


def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
    r"""Whenever there is a subgraph with this topology:

    Input    X          For X = -1 or X > 0
    |   \   /           This subgraph can be replaced with either
    |    Mul            an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max
    """

    if op.type == Op.Maximum:
        # finds the Mul input(s) to the Max
        muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
        if len(muls) == 1:
            mul = muls[0].ops[0]
        elif len(muls) == 2:
            # In the case both inputs are Muls, find the one with the same input as the Max
            mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
            if len(mul_ifms):
                mul = mul_ifms[0].ops[0]
            else:
                # Not using same input
                return op
        else:
            # No Mul inputs
            return op

        # make sure the Mul doesn't have any other consumers
        mul_ofm = mul.outputs[0]
        if len(mul_ofm.consumers()) != 1:
            return op
        # make sure the Mul doesn't have a fused activation function
        if mul.activation:
            return op
        ifm, ofm = op.get_ifm_ofm()
        if ifm is None or ofm is None:
            return op

        if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
            return op
        if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
            # rewrite to LeakyRelu currently only makes sense if the quantization is identical
            return op

        # finds the branched input that goes to both the Max and the Mul
        shared = set(op.inputs) & set(mul.inputs)
        if len(shared) == 1:
            shared_in = shared.pop()
            # find the constant scalar input to the Mul
            const_tens = (set(mul.inputs) - {shared_in}).pop()
            # check that it is a scalar
            if const_tens.shape != []:
                return op
            const = const_tens.ops[0]
            # check that it is a constant
            if const.type != Op.Const:
                return op
            # Remove the Mul from the shared input's consumers
            shared_in.consumer_list.remove(mul)
        else:
            return op

        val = const.outputs[0].values
        if val >= 0:
            new_op = Op.LeakyRelu
            op.attrs["alpha"] = val
            # to produce bit exact results, the alpha is not enough;
            # save additional scaling info in attr "alpha_scale", to be used as input
            # to the LUT construction
            alpha_scalar = const_tens.values - const_tens.quantization.zero_point
            mul_ifm_scale = np.double(ifm.quantization.scale_f32)
            mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
            mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
            alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
            op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
        elif val == -1:
            new_op = Op.Abs
        else:
            return op

        op.type = new_op
        op.name = op.name.replace("Maximum", new_op.name)
        op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
        op.inputs = [shared_in]
        op.set_ifm_ofm_shapes()

        # Record optimisation in debug database
        DebugDatabase.add_optimised(op, op)

    return op
1288
1289
1290def convert_hardswish_to_lut(op, arch, nng):
1291 if op.type == Op.HardSwish:
1292 ifm, ofm = op.get_ifm_ofm()
1293 # Generate the LUT
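        # hard_swish(x) = x * relu6(x + 3) / 6; the loop below evaluates this in 16-bit fixed point
        # for every possible quantized input value and collects the results into a 256-entry LUT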
1294 ifm_scale = np.double(ifm.quantization.scale_f32)
1295 ofm_scale = np.double(ofm.quantization.scale_f32)
1296 zp_in = ifm.quantization.zero_point
1297 zp_out = ofm.quantization.zero_point
1298 ifm_scale_hires = (1 / 128) * ifm_scale
1299 relu_multiplier = np.double(3 / 32768)
1300 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1301 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1302 # Use 16bit scale
1303 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1304 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1305
1306 values = []
1307 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1308 quantized_min = min(ix)
1309 quantized_max = max(ix)
1310 for x in ix:
1311 input_value = x - zp_in
1312 input_value_hires = input_value * 128
1313 # Compute the input value on essentially the output scale, not shifted yet
1314 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1315             # Compute the "relu-ish multiplier". This matches the code in the TensorFlow Lite Micro kernel
1316 relu_value = np.int16(input_value_hires)
1317 if relu_shift < 31:
1318 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1319
1320 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1321
1322 if relu_shift < 31:
1323 relu_value = fp_math.shift_left16(relu_value, 1)
1324
1325 if relu_shift > 31:
1326 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1327
1328             # The value has now been rescaled into a 16bit fixedpoint relu_value in [-1, 1]
1329 # Now convert that to a 16bit fixedpoint value in [0, 1]
1330 relu_value = (relu_value + (1 << 15)) >> 1
1331 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1332 shift = 31 - out_shift
1333 shift = -shift if shift < 0 else 0
1334 # Finally apply the output shift
1335 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1336 lut_result = min(quantized_max, max(quantized_min, lut_result))
1337 values.append(lut_result)
1338 return convert_to_lut(op, values, "hardswish")
1339 return op
1340
1341
1342def convert_lrelu_to_mul_max(op, arch):
1343 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1344 # (the opposite of convert_mul_max_to_abs_or_lrelu)
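    # For 0 < alpha < 1 the result is Maximum(alpha * IFM, identity * IFM). For alpha outside that
    # range the negative part (Minimum with zero) and the positive part (Relu) are scaled separately
    # and then combined with an Add instead of a Maximum.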
1345 ifm, ofm = op.get_ifm_ofm()
1346 if ifm is None or ofm is None:
1347 return op
1348
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001349 alpha = np.float32(op.attrs["alpha"])
1350 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001351 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001352 if use_mul_max:
1353 mul_ifm = ifm
1354 new_op = Op.Maximum
1355 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001356 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001357 no_scale_quant = ifm.quantization.clone()
1358 no_scale_quant.scale_f32 = None
1359 no_scale_quant.zero_point = 0
1360 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1361
1362 # Select values < 0
1363 min_op = Operation(Op.Minimum, op.name + "_min")
1364 min_op.add_input_tensor(ifm)
1365 min_op.add_input_tensor(zero)
1366 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001367 if alpha < 0 and not is_converted_prelu:
1368 # For negative alpha that is not from a converted PReLU we need to use
1369 # int32 Mul below to perform the (negative) alpha scaling
1370 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001371 min_op.set_output_tensor(mul_ifm)
1372 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001373 new_op = Op.Add
1374 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001375 DebugDatabase.add_optimised(op, min_op)
1376
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001377 # Add multiplication with alpha
1378 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001379 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001380 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001381 quantization = ifm.quantization.clone()
1382 quantization.min = 0
1383 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1384 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001385 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001386 if is_converted_prelu:
1387 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001388 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001389 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001390 elif alpha == 0 or np.isinf(1 / alpha):
1391 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001392 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001393 scalar = 0
1394 else:
1395 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001396 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001397 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001398 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1399 else:
1400 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001401 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001402 mul_alpha.add_input_tensor(alpha_tens)
1403 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1404 mul_alpha.set_output_tensor(fm_alpha)
1405 mul_alpha.set_ifm_ofm_shapes()
1406 DebugDatabase.add_optimised(op, mul_alpha)
1407
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001408 if not use_mul_max:
1409 relu_op = Operation(Op.Relu, op.name + "_relu")
1410 relu_op.add_input_tensor(ifm)
1411 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1412 relu_op.set_output_tensor(fm_id)
1413 relu_op.set_ifm_ofm_shapes()
1414 DebugDatabase.add_optimised(op, relu_op)
1415 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001416 # No identity multiplication is needed
1417 fm_id = ifm
1418 else:
1419 # Add multiplication with identity
1420 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1421 mul_identity.add_input_tensor(ifm)
1422 # Create const tensor containing identity as scalar
1423 quantization = ifm.quantization.clone()
1424 quantization.min = 0
1425 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001426 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001427 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001428 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001429 mul_identity.add_input_tensor(identity_tens)
1430 # Make sure that fm_id is allocated to a different address than fm_alpha
1431 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1432 mul_identity.set_output_tensor(fm_id)
1433 mul_identity.set_ifm_ofm_shapes()
1434 DebugDatabase.add_optimised(op, mul_identity)
1435
1436 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001437 op.type = new_op
1438 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001439 op.inputs = []
1440 ifm.consumer_list.remove(op)
1441 op.add_input_tensor(fm_alpha)
1442 op.add_input_tensor(fm_id)
1443 op.set_ifm_ofm_shapes()
1444
1445 DebugDatabase.add_optimised(op, op)
1446 return op
1447
1448
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001449def convert_to_lut8(op, fn, fn_name):
1450 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1451 # fn is a function(real) -> real
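    # Each LUT entry is: clamp(round_away_zero(zp_out + fn(ifm_scale * (x - zp_in)) / ofm_scale))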
1452 ifm, ofm = op.get_ifm_ofm()
1453 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1454 return op
1455 # Generate the LUT
1456 ifm_scale = np.double(ifm.quantization.scale_f32)
1457 ofm_scale = np.double(ofm.quantization.scale_f32)
1458 zp_in = ifm.quantization.zero_point
1459 zp_out = ofm.quantization.zero_point
1460 values = []
1461 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1462 quantized_min = min(ix)
1463 quantized_max = max(ix)
1464 for x in ix:
1465 x_real = ifm_scale * (x - zp_in)
1466 y_real = fn(x_real)
1467 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1468 lut_result = min(quantized_max, max(quantized_min, lut_result))
1469 values.append(lut_result)
1470 return convert_to_lut(op, values, fn_name)
1471
1472
1473def convert_lrelu_to_lut(op, arch):
1474 ifm, ofm = op.get_ifm_ofm()
1475 # Generate the LUT
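    # Inputs below the input zero point map through the alpha scaling, inputs at or above it map
    # through the identity scaling, giving LeakyRelu directly on the quantized values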
1476 alpha = op.attrs["alpha"]
1477 ifm_scale = np.double(ifm.quantization.scale_f32)
1478 ofm_scale = np.double(ofm.quantization.scale_f32)
1479 zp_in = ifm.quantization.zero_point
1480 zp_out = ofm.quantization.zero_point
1481 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1482 alpha_scalar = 1
1483 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1484 if "alpha_scaling" in op.attrs:
1485 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
1486 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1487 values = []
1488 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1489 quantized_min = min(ix)
1490 quantized_max = max(ix)
1491 for x in ix:
1492 if x < zp_in:
1493 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1494 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1495 )
1496 else:
1497 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1498 lut_result = min(quantized_max, max(quantized_min, lut_result))
1499 values.append(lut_result)
1500 return convert_to_lut(op, values, "lrelu")
1501
1502
1503def convert_lrelu(op, arch, nng):
1504 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
1505 if op.type != Op.LeakyRelu:
1506 return op
1507 ifm, ofm = op.get_ifm_ofm()
1508 if ifm is None or ofm is None:
1509 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001510 alpha = op.attrs["alpha"]
1511 if alpha == 0:
1512         # When alpha is 0 the operation can be converted to a ReLU
1513 op.type = Op.Relu
1514 op.name = op.name.replace("LeakyRelu", op.type.name)
1515 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001516 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1517 # use LUT for int8/uint8
1518 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001519 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001520 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001521 return op
1522 return convert_lrelu_to_mul_max(op, arch)
1523
1524
1525def convert_tanh_sigmoid_to_lut(op, arch, nng):
1526 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
1527 if op.type == Op.Sigmoid:
1528 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1529 elif op.type == Op.Tanh:
1530 return convert_to_lut8(op, math.tanh, "tanh")
1531 return op
1532
1533
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001534def fuse_activation_function_with_prev(op, arch, nng):
1535 # if op is a no-op: attempts to move the activation function to the preceding op
1536 if not op.attrs.get("is_nop", False) or op.activation is None:
1537 return op
1538 ifm, ofm = op.get_ifm_ofm()
1539 if ifm is None or ofm is None:
1540 return op
1541 # finds the input(s) to the operation
1542 prev_op = ifm.ops[0]
1543 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1544 fuse = (
1545 prev_op.run_on_npu
1546 and prev_op.type.npu_block_type != NpuBlockType.Default
1547 and len(ifm.ops) == 1
1548 and len(prev_op.outputs[0].consumers()) == 1
1549 and prev_op.activation is None
1550 )
1551 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1552 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1553 # LUT currently only works correctly for elementwise ops
1554 fuse = False
1555 if not fuse:
1556 return op
1557 # Move the fused activation function + corresponding info to prev_op
1558 prev_op.activation = op.activation
1559 prev_op.forced_output_quantization = op.forced_output_quantization
1560 if op.activation_lut is not None:
1561 prev_op.set_activation_lut(op.activation_lut)
1562 # Bypass op
1563 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001564 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001565 return op
1566
1567
1568def _leading_pad_ok(leading_pad, stride, kernel_size):
1569 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1570 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
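    # Example: kernel 7 (max_size 3) and stride 2: a leading pad of 3, 2 or 0 is accepted, while 1 is not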
1571 max_size = kernel_size // 2
1572 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1573
1574
1575def replace_pad_by_hw_pad(op: Operation, arch, nng):
1576 """
1577 Tries to completely remove a PAD operator by using hardware padding.
1578 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1579 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1580 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1581 if both operations can be run on the NPU.
1582 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1583 """
1584 if (
1585 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001586 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001587 and op.run_on_npu
1588 and op.attrs["padding"] == Padding.VALID
1589 ):
1590 pad_op = op.ifm.ops[0]
1591 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1592 return op
1593 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1594 return op
1595 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1596 k = op.kernel
1597 k_w, k_h = k.dilated_wh()
1598
1599 # Check if the PAD operator can be replaced by hardware padding
1600 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1601 # Too much padding, it would require hardware padding to actually insert zeros
1602 return op
1603 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1604 return op
1605
1606 if op.type.is_avgpool_op():
1607 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1608 for pad, k_size in (
1609 (left, k_w),
1610 (right, k_w),
1611 (top, k_h),
1612 (bottom, k_h),
1613 ):
1614 if pad not in (0, k_size // 2):
1615 return op
1616 # Average pool is converted to depthwise, because NPU average pool + same padding
1617 # has a special implementation that is different from PAD followed by average pool with
1618 # valid padding.
1619 k_w, k_h = op.kernel.width, op.kernel.height
1620 ifm = op.ifm
1621 # Remember other inputs
1622 other_inputs = op.inputs[1:]
1623 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1624 quantization = QuantizationParameters(0.0, 255.0)
1625 quantization.scale_f32 = 1.0 / (k_w * k_h)
1626 quantization.zero_point = 0
1627 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1628 weights = np.full(shape, 1)
1629
1630 weight_tens = create_const_tensor(
1631 op.name + "_weights",
1632 shape,
1633 op.ifm.dtype,
1634 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001635 purpose=TensorPurpose.Weights,
1636 quantization=quantization,
1637 )
James Peet7519d502021-07-19 16:47:58 +01001638 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001639 op.type = Op.DepthwiseConv2DBias
1640 op.inputs = []
1641 op.add_input_tensor(ifm)
1642 op.add_input_tensor(weight_tens)
1643 # Add bias tensor, all biases set to 0
1644 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001645 fixup_bias_tensors(op, arch, nng, DataType.int32)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001646 # Add other inputs
1647 op.inputs.extend(other_inputs)
1648 op.rounding_mode = NpuRoundingMode.NATURAL
1649
1650 # Bypass the PAD operator
1651 op.set_input_tensor(pad_op.ifm, 0)
1652 # Adjust the padding attributes of the convolution operator
1653 op.attrs["padding"] = Padding.EXPLICIT
1654 op.attrs["explicit_padding"] = (top, left, bottom, right)
1655 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001656 DebugDatabase.add_optimised(op, op)
1657
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001658 return op
1659
1660
1661def convert_pad(op: Operation, arch, nng):
1662 """
1663 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1664 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1665     This is done as a fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1666 """
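    # The op itself becomes a ConcatTFLite and each region of the OFM is written by an average pool
    # (create_avg_pool_for_concat); the border regions are filled with the OFM zero point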
1667 if op.type != Op.Pad or not op.run_on_npu:
1668 return op
1669 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1670
1671 ifm = op.ifm
1672 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001673 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001674 ofm = op.ofm
1675 assert ofm is not None
1676 ofm.ops = []
1677 ofm_shape = op.ofm_shapes[0]
1678
1679 # Average pool op that copies IFM to the right place inside the OFM
1680 shp0 = Shape4D(0, 0, 0, 0)
1681 shp_top = shp0.with_height(top)
1682 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1683 avgpool_op.activation = op.activation
1684 quant = ofm.quantization
1685 pad_value = quant.zero_point
1686 # Add operations that fill the borders of the OFM
1687 if top > 0:
1688 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1689 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001690 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001691 )
1692 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1693 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1694 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1695 if bottom > 0:
1696 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1697 zero_tens = create_const_tensor(
1698 op.name + "_bottom",
1699 shape.as_list(),
1700 ofm.dtype,
1701 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001702 quantization=quant,
1703 )
1704 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1705 create_avg_pool_for_concat(
1706 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1707 )
1708 if left > 0:
1709 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1710 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001711 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001712 )
1713 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1714 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1715 if right > 0:
1716 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1717 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001718 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001719 )
1720 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1721 create_avg_pool_for_concat(
1722 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1723 )
1724
1725 op.type = Op.ConcatTFLite
1726 return avgpool_op
1727
1728
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001729def fixup_bias_tensors(op, arch, nng, dtype=None):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001730 if op.type.needs_bias() and op.bias is None:
1731 # Op has no bias, add bias tensor filled with zeros
1732 nr_biases = op.inputs[1].shape[-1]
1733 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001734 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1735 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1736 # For int16 the selected bias DataType will have an impact on the scaling
1737 # used when encoding the scales and biases later. The default mode will match the
1738         # reference with reduced scaling for int64 bias.
1739 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1740         # is used to emulate average pool, int32 bias should be selected for full precision
1741 # int16 scaling.
1742 if dtype is None:
1743 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1744 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001745 op.set_input_tensor(bias_tensor, op.type.info.indices.biases[0])
1746
1747 return op
1748
1749
wilisa0146c94772023-02-08 09:56:14 +00001750def detect_asymmetric_weights(op):
1751 # Check all ops (cpu and npu)
1752 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1753 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001754 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001755 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1756 return True
1757 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001758
wilisa0146c94772023-02-08 09:56:14 +00001759
1760def fixup_asymmetric_weights(op, arch, nng):
1761 if detect_asymmetric_weights(op):
1762 if op.run_on_npu:
1763 print("Zero points have been adjusted.")
1764 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001765 return op
1766
1767
wilisa0146c94772023-02-08 09:56:14 +00001768def check_asymmetric_weights(op, arch, nng):
1769 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1770 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1771    # possibility of other graph optimiser functions modifying the operator (that is later run on the CPU)
1772 if detect_asymmetric_weights(op):
1773 if op.run_on_npu:
1774 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1775 op.run_on_npu = False
1776 return op
1777
1778
1779def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
1780 if force_symmetric_int_weights:
1781 return fixup_asymmetric_weights
1782 else:
1783 return check_asymmetric_weights
1784
1785
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001786def convert_mean_to_depthwise_conv_or_avgpool(op, arch, nng):
1787 if op.type == Op.Mean and op.run_on_npu:
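        # The mean over the height/width axes is emulated either by an AvgPool with truncating
        # rounding (when input and output scaling are equal) or by a DepthwiseConv2DBias with unit
        # weights scaled by 1/(h*w) and a bias of -zero_point*h*w to compensate for the input zero point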
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001788 inp, axis = op.inputs
1789 shape = inp.shape
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001790 ofm_shape = op.ofm.shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001791 dims = len(shape)
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001792 dims_ofm = len(ofm_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001793
1794         # Height and width axes have different indices depending on the number of dimensions
1795 if axis.shape == [] or axis.shape[0] == 1: # single axis
1796 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
1797 if dims in (2, 3):
1798 if axis == 0:
1799 h, w = shape[axis], 1
1800 else:
1801 h, w = 1, shape[axis]
1802 else:
1803 if axis == 1:
1804 h, w = shape[axis], 1
1805 else:
1806 h, w = 1, shape[axis]
1807 else: # multiple axes
1808 axis = sorted(axis.values)
1809 h, w = [shape[i] for i in axis]
1810
1811 # Set necessary depthwise attributes
1812 op.attrs.update(
1813 {
1814 "padding": Padding.VALID,
1815 "stride_h": 1,
1816 "stride_w": 1,
1817 "strides": (1, 1, 1, 1),
1818 "depth_multiplier": 1,
1819 "channel_multiplier": 1,
1820 "dilation_h_factor": 1,
1821 "dilation_w_factor": 1,
1822 "dilation": (1, 1, 1, 1),
1823 }
1824 )
1825 # Change op type
1826 op.type = Op.DepthwiseConv2DBias
1827 # Set IFM/OFM shapes after changing op type
1828 op.set_ifm_ofm_shapes()
1829
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001830 weight_scale, bias = 1, 0
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001831 ofmq, ifmq = op.ofm.quantization, inp.quantization
Johan Alfvén9d51ec42022-10-27 16:30:01 +02001832 if ifmq.is_scaling_equal(ofmq):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001833 # Here we can just use a simple AvgPool with truncating rounding,
1834 # as we're emulating simple integer division.
1835 op.rounding_mode = NpuRoundingMode.TRUNCATE
1836 op.type = Op.AvgPool
1837 op.attrs.update({"ksize": (1, h, w, 1), "filter_height": h, "filter_width": w})
1838 else:
1839 op.rounding_mode = NpuRoundingMode.NATURAL
1840 weight_scale = 1 / (h * w)
1841 # Input zero point is adjusted after mean calculation, so we emulate that with a bias
1842 bias = -ifmq.zero_point * h * w
1843 fiq = ifmq.clone()
1844 fiq.zero_point = 0
1845 op.forced_input_quantization = fiq
1846
1847 # Change dimensions to 4
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001848 def extend_dims(dim, in_shape):
1849 if dim < 4:
1850 in_shape = [1] + in_shape
1851 if dim == 2:
1852 in_shape += [1]
1853 return in_shape
1854
1855 if dims < 4 or dims_ofm < 4:
1856 # Fix the ofm dimension when keep_dims is false
1857 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
1858 if isinstance(axis, int) and dims_ofm + 1 == dims:
1859 ofm_shape.insert(axis, 1)
1860 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
1861 for i in axis:
1862 ofm_shape.insert(i, 1)
1863 shape = extend_dims(dims, shape)
1864 dims_ofm = len(ofm_shape)
1865 ofm_shape = extend_dims(dims_ofm, ofm_shape)
1866 op.set_ifm_ofm_shapes()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001867
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001868 # If height is greater than max kernel height, reshape from HxW to 1x(HxW)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001869 weight_shape = None
Rickard Bolin7d7cb672021-12-07 09:09:14 +00001870 if (h > 64 and op.type == Op.DepthwiseConv2DBias) or (h > 256 and op.type == Op.AvgPool):
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001871         # This can only happen, and is only handled, when reducing over multiple axes, and
1872 # h * w <= 256 for DepthwiseConv2DBias
1873 # h * w <= 4096 for AvgPool
1874 # which is checked in supported ops
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001875 shape = [shape[0], 1, h * w, shape[3]]
1876 op.ifm_shapes[0] = Shape4D(shape)
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001877 weight_shape = [1, h * w, shape[3], shape[0]]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001878 if h > 256 and op.type == Op.AvgPool:
1879 op.attrs.update({"ksize": (1, 1, h * w, 1), "filter_height": 1, "filter_width": h * w})
1880
1881 # If the AvgPool version is used, we don't need to do anything else
1882 if op.type == Op.AvgPool:
wilisa0179a89042022-11-02 17:18:43 +00001883 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001884 return op
1885
1886 # Make unit weight tensor quantization
1887 weight_quant = ifmq.clone()
1888 weight_quant.min = 0
1889 weight_quant.max = 255
1890 weight_quant.scale_f32 = weight_scale
1891 weight_quant.zero_point = 0
1892
Johan Alfvéne84ed6b2022-09-26 13:46:51 +02001893 if weight_shape is None:
1894 # Set weight shape to [H,W,C,B]
1895 weight_shape = [h, w, shape[3], shape[0]]
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001896
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001897 # Add unit weight tensor
1898 op.set_input_tensor(
1899 create_const_tensor(
1900 "weights",
1901 weight_shape,
1902 inp.dtype,
1903 np.ones(weight_shape),
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001904 quantization=weight_quant,
1905 ),
1906 1,
1907 )
James Peet7519d502021-07-19 16:47:58 +01001908 op.weights.values = np.reshape(op.inputs[1].values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001909
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001910 # Add bias tensor
Fredrik Svedberg1e5456f2022-09-23 15:25:17 +02001911 bias_shape = [shape[-1]]
1912 op.inputs.append(create_const_tensor("bias", bias_shape, DataType.int32, np.ones(bias_shape) * bias))
wilisa0179a89042022-11-02 17:18:43 +00001913 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001914
1915 return op
1916
1917
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001918def optimise_quantize(op: Operation, arch, nng):
1919
1920 if op.type == Op.Quantize and op.run_on_npu:
1921
1922 ifm, ofm = op.get_ifm_ofm()
1923 input_values = ifm.values
1924
1925 # Guard clause - input not const or no values to quantize
1926 if ifm.ops[0].type != Op.Const or input_values is None:
1927 return op
1928
1929 # Singular val in numpy array, convert to indexable array
1930 if input_values.ndim == 0:
1931 input_values = np.array([input_values])
1932
Fredrik Svedberg11563172022-07-06 14:54:12 +02001933         # Requantize int8 to int8 or int16 to int16
1934 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001935
1936 # scale needs to use double precision to match TFLite reference kernel
1937 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
1938 effective_multiplier, effective_shift = quantise_scale(effective_scale)
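            # Each constant value is then requantized as
            #   clamp(multiply_by_quantized_multiplier(val - zp_in, multiplier, shift) + zp_out)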
1939
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001940 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001941 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001942 input_val = val - ifm.quantization.zero_point
1943
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001944 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
1945 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001946
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001947 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
1948 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001949
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001950 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
1951 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001952
1953 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001954 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001955
1956 quantized_vals = []
1957 for val in input_values:
1958
1959 # Derive quantized value
1960 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001961 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
1962 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001963
1964 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02001965 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
1966
1967 # Unsupported data type
1968 else:
1969 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01001970
1971 # Make quantize op const and disconnect from parent node
1972
1973 # Remove reference of the current quant op from the parent tensor's consumer list
1974 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1975
1976 # Clear any references to parent node
1977 op.inputs = []
1978
1979 # Convert this quantize op to const
1980 op.type = Op.Const
1981
1982 return op
1983
1984
Ayaan Masood4965fae2022-06-29 11:30:57 +01001985def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
1986 """Static optimisation for SHAPE operator output value known at compile time"""
1987
1988 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
1989
1990 if op.type == Op.Shape and op.run_on_npu:
1991
1992 ifm, ofm = op.get_ifm_ofm()
1993
1994 if len(ifm.shape) != ofm.shape[0]:
1995 return op
1996
1997 # Remove reference of the current shape op from the parent tensor's consumer list
1998 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
1999
2000 # Clear any references to parent node
2001 op.inputs = []
2002
2003 # Convert this SHAPE op to const
2004 op.type = Op.Const
2005
2006         # The SHAPE output is the (statically known) shape of the input tensor
2007 ofm.values = np.array(ifm.shape)
2008
2009 return op
2010
2011
Tim Hallea4ba662022-11-11 18:19:53 +00002012def fixup_dilation_gt2(op, arch, nng):
2013 assert op.run_on_npu
2014 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
2015 dilation_w, dilation_h = op.get_kernel_dilation()
2016
2017 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
2018 # kernel
2019 if dilation_w > 2 or dilation_h > 2:
2020 kernel_w, kernel_h = op.get_kernel_size()
2021 kernel_ic = op.weights.shape[-2]
2022 kernel_oc = op.weights.shape[-1]
2023
2024             # if the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
2025             # of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
2026 # odd = 1, even = 2
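            # Example: dilation 4 gives hw_dilation 2 and scale_dilation 2, so a 3x3 kernel becomes a
            # sparse 5x5 kernel ((3 - 1) * 2 + 1) that is then run with hardware dilation 2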
2027 hw_dilation_h = 1 if (dilation_h & 1) else 2
2028 hw_dilation_w = 1 if (dilation_w & 1) else 2
2029
2030 scale_dilation_h = dilation_h // hw_dilation_h
2031 scale_dilation_w = dilation_w // hw_dilation_w
2032
2033 # create new empty kernel (HWIO format)
2034 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
2035 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
2036
2037 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
2038 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
2039
2040 # copy the original kernel values into the new sparse kernel
2041 for h in range(0, kernel_h):
2042 for w in range(0, kernel_w):
2043 new_h = h * scale_dilation_h
2044 new_w = w * scale_dilation_w
2045 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
2046
2047 # update the weight tensor with the new dilated kernel
2048 op.weights.shape = new_kernel_shape
2049 op.weights.values = new_kernel_values
2050
2051 # enable(=2) / disable(=1) hardware dilation
2052 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
2053 op.attrs["dilation_h_factor"] = hw_dilation_h
2054 op.attrs["dilation_w_factor"] = hw_dilation_w
2055
2056 return op
2057
2058
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002059def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02002060 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002061 return op
2062
2063
wilisa0146c94772023-02-08 09:56:14 +00002064def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
Fredrik Svedberg11563172022-07-06 14:54:12 +02002065 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00002066 optimisation_list = [
2067 optimise_quantize,
2068 convert_shape_op_to_constant_tensor,
2069 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
2070 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002071
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002072 for idx, sg in enumerate(nng.subgraphs):
2073 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002074 nng,
2075 sg,
2076 arch,
2077 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01002078 optimisation_list,
2079 rewrite_unsupported=False,
2080 )
2081
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002082 # Pre-processing step
wilisa0146c94772023-02-08 09:56:14 +00002083 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002084
Ayaan Masood4965fae2022-06-29 11:30:57 +01002085 for idx, sg in enumerate(nng.subgraphs):
2086 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2087 nng,
2088 sg,
2089 arch,
2090 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002091 pre_process_list,
2092 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002093 )
2094
2095 # Handle Concat Ops
2096 for idx, sg in enumerate(nng.subgraphs):
2097 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
2098 sg.refresh_after_modification()
2099
2100 # Handle Split Ops
2101 for idx, sg in enumerate(nng.subgraphs):
2102 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2103 nng,
2104 sg,
2105 arch,
2106 [],
2107 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
2108 rewrite_unsupported=False,
2109 )
2110
2111 for idx, sg in enumerate(nng.subgraphs):
2112 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002113 nng,
2114 sg,
2115 arch,
2116 [rewrite_split_ops],
2117 [],
2118 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002119 )
2120
Johan Alfvena5e1b622023-02-02 14:59:03 +01002121 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002122 for idx, sg in enumerate(nng.subgraphs):
2123 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002124 nng,
2125 sg,
2126 arch,
2127 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01002128 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002129 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002130 )
2131
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002132 # Rewrite of operators
2133 op_rewrite_list = [
2134 set_tensor_equivalence,
2135 convert_mean_to_depthwise_conv_or_avgpool,
2136 convert_depthwise_to_conv,
2137 convert_conv_to_fc,
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02002138 convert_lstm,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002139 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02002140 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02002141 convert_mul_max_to_abs_or_lrelu,
2142 convert_lrelu,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002143 convert_hardswish_to_lut,
2144 rewrite_fully_connected_input,
2145 convert_batched_fc_shape,
2146 fixup_conv2d_backprop,
2147 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002148 reorder_depthwise_weights,
Rickard Bolin6986a072022-12-19 12:33:40 +00002149 convert_argmax_to_depthwise_conv_and_max_pool,
Tim Hall885033b2022-07-21 11:46:03 +01002150 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002151 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01002152 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002153 convert_tanh_sigmoid_to_lut,
2154 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00002155 fixup_dilation_gt2,
Raul Farkas72c6a242023-03-16 16:38:05 +00002156 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002157 ]
2158
2159 for idx, sg in enumerate(nng.subgraphs):
2160 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002161 nng,
2162 sg,
2163 arch,
2164 [],
2165 op_rewrite_list,
2166 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002167 )
2168
2169 for idx, sg in enumerate(nng.subgraphs):
2170 # remove passthrough tensors and attempt further optimizations
2171 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2172 nng,
2173 sg,
2174 arch,
2175 [remove_passthrough_tensor],
2176 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2177 )
2178
2179     # Removal of SplitSliceRead needs to be done after optimisation has been performed,
2180 # since ifm/ofm_shapes are of importance to this function
2181 for sg in nng.subgraphs:
2182 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2183 sg.refresh_after_modification()
2184
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002185 # Make sure that const optimisations on subgraph outputs are handled correctly
2186 for sg in nng.subgraphs:
2187 for ofm in sg.output_tensors:
2188 if ofm.is_const and ofm.ops[0].type_changed:
2189 # Subgraph output cannot be const - insert a memory copy
2190 op = ofm.ops[0]
2191 ofm_clone = ofm.clone()
2192 ofm_clone.values = ofm.values
2193 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002194 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002195 memcpy = create_add_nop(f"{ofm.name}_copy")
2196 memcpy.add_input_tensor(ofm_clone)
2197 memcpy.add_input_tensor(zero)
2198 memcpy.set_output_tensor(ofm)
2199 memcpy.set_ifm_ofm_shapes()
2200 op.set_output_tensor(ofm_clone)
2201 DebugDatabase.add_optimised(op, memcpy)
2202
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002203 return nng