blob: ef6b90b5ce6aa62ecf481d76e6578b8943ff5e0a [file] [log] [blame]
Tim Hall3b1578e2023-01-13 17:57:25 +00001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020017# Description:
18# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
19# to do the traversal of the graph.
Raul Farkas10d6b3b2023-01-30 12:58:46 +000020from __future__ import annotations
21
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020022import math
23import uuid
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020024
25import numpy as np
26
27from . import fp_math
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020028from . import rewrite_graph
29from . import scaling
Fredrik Svedberga04f2f72022-07-06 13:42:24 +020030from .data_type import BaseType
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020031from .data_type import DataType
32from .debug_database import DebugDatabase
33from .errors import UnsupportedFeatureError
34from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +020035from .graph_optimiser_util import bypass_memory_only_ops
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020036from .graph_optimiser_util import calc_explicit_padding
Patrik Gustavssondf995102021-08-23 15:33:59 +020037from .graph_optimiser_util import convert_depthwise_to_conv
Fredrik Svedberg0ac08042023-04-11 22:35:04 +020038from .graph_optimiser_util import create_avg_pool_for_concat
Jonas Ohlsson0957e3e2021-09-01 15:57:21 +020039from .graph_optimiser_util import memory_only_ops
Patrik Gustavssonf1580f02021-09-01 12:43:02 +020040from .graph_optimiser_util import move_splitsliceread_to_consumer
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020041from .graph_optimiser_util import needed_total_padding
42from .graph_optimiser_util import set_ifm_ofm_op_shapes
43from .graph_optimiser_util import set_tensor_equivalence
Fredrik Svedberg0ac08042023-04-11 22:35:04 +020044from .lstm import Lstm
Johan Alfvence502732023-04-24 13:35:40 +020045from .lut import convert_to_lut
46from .lut import create_lut_8bit_op
47from .lut import create_lut_int16_op
Johan Alfven8e525ca2023-05-07 13:12:37 +020048from .lut import create_lut_rsqrt_int8_op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020049from .numeric_util import clamp_sigmoid
Johan Alfven56811e62023-03-27 11:33:50 +020050from .numeric_util import full_shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020051from .numeric_util import round_away_zero
Johan Alfven7b3008a2023-04-13 18:54:47 +020052from .numeric_util import round_down_log2
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020053from .operation import create_activation_function
Fredrik Svedberg1a7527c2021-09-13 15:52:16 +020054from .operation import ExplicitScaling
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020055from .operation import NpuBlockType
56from .operation import Op
57from .operation import Operation
58from .operation import Padding
Tim Hall5ff4cd12023-05-16 22:39:14 +010059from .operation import RoundingMode
Alexander Hansson90c34b52023-05-31 15:03:03 +000060from .operation_util import create_add
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +010061from .operation_util import create_add_nop
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020062from .operation_util import create_avgpool_nop
Johan Alfvenc1ad80b2023-03-31 10:19:23 +020063from .operation_util import create_cast_op
Rickard Bolin6986a072022-12-19 12:33:40 +000064from .operation_util import create_depthwise_maxpool
Johan Alfvenc1ad80b2023-03-31 10:19:23 +020065from .operation_util import create_memcpy
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020066from .operation_util import get_pad_values_from_input
Ayaan Masood25f48dd2022-06-29 18:16:04 +010067from .scaling import quantise_scale
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020068from .shape4d import Shape4D
69from .softmax import SoftMax
70from .tensor import check_quantized_tens_scaling_equal
71from .tensor import create_const_tensor
72from .tensor import create_equivalence_id
73from .tensor import QuantizationParameters
74from .tensor import Tensor
75from .tensor import TensorPurpose
76from .tflite_mapping import optype_to_builtintype
Raul Farkas3b64f062023-05-16 17:18:31 +010077from .utils import calc_resize_factor
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020078
79passthrough_nodes = (Op.Identity,)
80
81
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +020082def remove_passthrough_tensor(tens, arch, nng):
83 if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
84 assert len(tens.ops[0].inputs) == 1
85 tens = tens.ops[0].inputs[0]
86 return tens
87
88
89def rewrite_concat_ops(op, arch):
90 if not op.run_on_npu or not op.type.is_concat_op():
91 return
92
93 axis_4D = 0
94 ofm = op.ofm
95 ofm.ops = []
96 offset = 0
97
98 unfuse_activation_function(op)
99
100 if op.type == Op.Pack:
101 # Pack is also referred to as Stack
102 axis = int(op.attrs["axis"])
103 if axis < 0: # Convert to positive axis
104 axis = len(op.inputs[0].shape) + 1 + axis
105
106 desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
107
108 axis_4D = axis + (4 - len(desired_shape))
109
110 for idx, inp in enumerate(op.inputs):
111 op.ifm_shapes[idx] = Shape4D(desired_shape)
112 op.type = Op.PackReshaped
113
114 inputs, axis = op.get_concat_inputs_axis()
115 for idx, inp in enumerate(inputs):
116 if op.type != Op.PackReshaped:
117 op.ifm_shapes[idx] = Shape4D(inp.shape)
118 if axis >= 0:
119 axis_4D = axis + (4 - len(inp.shape))
120 else:
121 axis_4D = axis
122 write_offset = [0, 0, 0, 0]
123 write_offset[axis_4D] = offset
124 concat_end = offset + op.ifm_shapes[idx][axis_4D]
125 create_avg_pool_for_concat(
126 op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
127 )
128 offset = concat_end
129 assert ofm.shape[axis] == offset
130
131 return op
132
133
134def rewrite_split_ops(tens, arch, nng):
135
136 if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
137 split_op = tens.ops[0]
138
139 # Not supported so leave it and run on CPU
140 if not split_op.run_on_npu:
141 return tens
142
143 inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()
144
145 tens.ops = []
146 new_op = Operation(Op.SplitSliceRead, split_op.name)
147 new_op.inputs = [inp]
148 ofm_shape_idx = 0
Tim Hall51a8dce2021-12-20 16:49:27 +0000149 if None in (offset_end, offset_start):
150 read_shape = None
151 else:
152 # the read shape is relative to each start offset
William Isakssona71efe02023-07-12 12:28:05 +0000153 read_shape = Shape4D([oe - os for oe, os in zip(offset_end, offset_start)])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200154
155 # For Split the offset cannot be extracted from the tensor so it has to
156 # be calculated from the index of the output tensor
157 if axis is not None:
158 # Get the start and end of the split
159 offset_start = [0] * 4
160 axis_4D_list = split_op.attrs.get("split_axis_4D", None) # Present for UnpackReshaped and some StridedSlice
161 for idx, out in enumerate(outputs):
162 if axis_4D_list is not None:
163 axis_4D = axis_4D_list[idx]
164 else:
165 split_op.ofm_shapes[idx] = Shape4D(out.shape)
166 if axis >= 0:
167 axis_4D = axis + (4 - len(out.shape))
168 else:
169 axis_4D = axis
170
171 if out == tens:
172 ofm_shape_idx = idx
173 read_shape = split_op.ofm_shapes[idx]
174 break
175
176 offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]
177
178 new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
179 new_op.read_shapes[0] = read_shape
180 new_op.run_on_npu = True
181 new_op.set_output_tensor(tens)
182 new_op.ifm_shapes.append(Shape4D(inp.shape))
183 new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
184 DebugDatabase.add_optimised(split_op, new_op)
185
186 return tens
187
188
189def remove_SplitSliceRead(op, arch):
190
191 if op.type == Op.SplitSliceRead:
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200192 # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
193 # or if an avgpool need to be inserted
194 if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
195 consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
196 for consumer in op.ofm.consumer_list
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200197 ):
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200198 # SplitSliceRead can be performed by tensor consumer(s)
199 for cons_op in list(op.ofm.consumer_list):
200 move_splitsliceread_to_consumer(op, cons_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200201 else:
202 avgpool_op = create_avgpool_nop(op.name + "_avgpool")
203 avgpool_op.add_input_tensor(op.ifm)
204 avgpool_op.outputs = [op.ofm]
205 op.ofm.ops.remove(op)
206 op.ofm.ops.append(avgpool_op)
207 avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
208 avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
209 avgpool_op.read_offsets[0] = op.read_offsets[0]
210 avgpool_op.read_shapes[0] = op.read_shapes[0]
211
212 op.ifm.consumer_list.remove(op)
213 DebugDatabase.add_optimised(op, avgpool_op)
214
215
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200216def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
217 k_w, k_h = kernel.dilated_wh()
218 s_x, s_y = kernel.stride
219 ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
220 xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
221 if padding_type == Padding.SAME:
222 left_pad = (xpad + 0) // 2
223 right_pad = (xpad + 1) // 2
224 top_pad = (ypad + 0) // 2
225 bottom_pad = (ypad + 1) // 2
226 elif padding_type == Padding.VALID:
227 left_pad = 0
228 right_pad = 0
229 top_pad = 0
230 bottom_pad = 0
231 elif padding_type == Padding.EXPLICIT:
232 # Padding is specified in a PAD operator which has been bypassed.
233 top, left, bottom, right = explicit_padding
234 top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
235 left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
Rickard Bolin9ae34552022-06-09 13:07:17 +0000236 elif padding_type == Padding.TILE:
237 # The values in the explicit padding only represent the "direction" in which to pad
238 top_pad, left_pad, bottom_pad, right_pad = explicit_padding
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200239 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000240 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200241 padding = (top_pad, left_pad, bottom_pad, right_pad)
242 skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
243 return padding, skirt
244
245
246def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
247 kernel_height, kernel_width = kernel_size[0], kernel_size[1]
248 if padding_type == Padding.SAME:
249 ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
250 xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
251 right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
252 bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
253 left_pad = max(kernel_width - 1 - right_pad, 0)
254 top_pad = max(kernel_height - 1 - bottom_pad, 0)
255 elif padding_type == Padding.VALID:
256 right_pad = max(kernel_width - 2, 0)
257 bottom_pad = max(kernel_height - 2, 0)
258 left_pad = kernel_width - 1
259 top_pad = kernel_height - 1
260 else:
Tim Hall0ab2edc2022-02-23 17:58:02 +0000261 raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200262 padding = (top_pad, left_pad, bottom_pad, right_pad)
263 skirt = padding
264 return padding, skirt
265
266
Raul Farkas66207142023-05-25 11:15:20 +0100267def fixup_conv2d_backprop(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200268 if op.type == Op.Conv2DBackpropInput:
269 # flip the inputs
270 op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
271 op.type = Op.Conv2DBackpropInputSwitchedBias
Tim Hall3c5cfe92022-03-16 16:31:57 +0000272 op.ifm_resampling_mode = resampling_mode.TRANSPOSE
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200273
274 # Update strides
275 op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
wilisa0179a89042022-11-02 17:18:43 +0000276 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200277
278 return op
279
280
281# Convert the op to an elementwise add
Tim Hall885033b2022-07-21 11:46:03 +0100282def convert_resize_1x1_to_add(op):
283 op.type = Op.Add # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200284 op.name = op.name + "_add"
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200285 # Create an input tensor filled with zeros
wilisa018289d512023-01-12 08:17:23 +0000286 name = op.inputs[1].name + "_add"
287 dtype = op.inputs[0].dtype
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200288 shape = op.ofm_shapes[0].as_list()
wilisa018289d512023-01-12 08:17:23 +0000289 values = np.zeros(shape, dtype.as_numpy_type())
290 quantization = QuantizationParameters(0.0, 255.0)
291 quantization.scale_f32 = 1.0
292 quantization.zero_point = 0
wilisa0116b5e5e2023-02-14 12:03:59 +0000293 op.inputs[1] = op.inputs[0]
294 op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200295 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000296 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200297
298 return op
299
300
Tim Hall5ff4cd12023-05-16 22:39:14 +0100301# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
Tim Hall885033b2022-07-21 11:46:03 +0100302# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
303# to select the appropriate nearest neighbor value
304def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
305 ifm = op.ifm
306 ofm = op.ofm
307 output_depth = ofm.shape[-1]
308 dw_op_attrs = {
309 "padding": Padding.VALID,
310 "stride_h": 1,
311 "stride_w": 1,
312 "strides": (1, 1, 1, 1),
313 "depth_multiplier": 1,
314 "channel_multiplier": 1,
315 "dilation_h_factor": 1,
316 "dilation_w_factor": 1,
317 "dilation": (1, 1, 1, 1),
318 }
319
Tim Hall5ff4cd12023-05-16 22:39:14 +0100320 # change ResizeNearestNeighbor to Depthwise
Tim Hall885033b2022-07-21 11:46:03 +0100321 op.type = Op.DepthwiseConv2DBias
322 op.attrs.update(dw_op_attrs)
323 op.set_input_tensor(ifm, 0) # ifm tensor index
324 op.activation = None
325
326 # add input resample to resize by x2
327 op.ifm_resampling_mode = resampling_mode.NEAREST
328
329 # don't care about the rounding mode as it is nearest neighbor
330
331 # setup weight tensor
332 weight_quant = QuantizationParameters()
333 weight_quant.scale_f32 = 1.0 # no scaling as only a single non-zero coeff to select the desired value
334 weight_quant.zero_point = 0
335 weight_quant.quant_dim = 0
336 ofm_dtype = ofm.dtype
Tim Hall3b1578e2023-01-13 17:57:25 +0000337 if ofm_dtype.type == BaseType.UnsignedInt:
Tim Hall885033b2022-07-21 11:46:03 +0100338 weight_quant.quant_min = 0
339 weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
340 else:
Tim Hall885033b2022-07-21 11:46:03 +0100341 weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
342 weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1
343
344 weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth] # HWIO
345
346 # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
347 # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
348 # below-and-right (i.e. next) to it (D).
349 # 0---1---2
350 # | A | B |
351 # 1---*---+
352 # | C | D |
353 # 2---+---+
354 weight_values = [0] * (upscale_factor * upscale_factor)
355 centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
356 weight_values[centre_coeff] = 1
357
358 # add weight tensor, this will discard the size tensor of the resize op
359 op.set_input_tensor(
360 create_const_tensor(
361 "weights",
362 weight_shape,
Tim Hall3b1578e2023-01-13 17:57:25 +0000363 ofm_dtype,
Tim Hall885033b2022-07-21 11:46:03 +0100364 np.array(weight_values).reshape(weight_shape),
Tim Hall885033b2022-07-21 11:46:03 +0100365 quantization=weight_quant,
366 ),
367 1, # inputs tensor weight index
368 )
369
370 # setup bias tensor by assign None and then call the fix-up function to create a suitable tensor.
371 # need to append the bias tensor as resize ops only have 2 inputs
372 assert len(op.inputs) == 2
373 op.inputs.append(None)
Fredrik Svedbergcc219be2022-09-20 16:32:52 +0200374 fixup_bias_tensors(op, None, None, DataType.int32)
Tim Hall885033b2022-07-21 11:46:03 +0100375
376 # finally update the shape incase we've change the tensor shapes or connections
377 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000378 DebugDatabase.add_optimised(op, op)
Tim Hall885033b2022-07-21 11:46:03 +0100379
380 return op
381
382
383# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
384# final average pool with a kernel size that depends upon the resize ops upscaling factor (x2, x4 or x8). The maximum
385# upscale factor is limited to x8 because of the limit 8x8 kernel size limit for average pool with padding.
386def convert_resize_to_upscale_and_average_pool(op):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200387 pre_op = op
388 outputs = op.outputs
Rickard Boline546def2022-01-25 15:45:00 +0000389 dtype = op.ifm.dtype
Tim Hall885033b2022-07-21 11:46:03 +0100390
Rickard Boline546def2022-01-25 15:45:00 +0000391 op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
Tim Hall47c76362022-07-18 21:26:47 +0100392 op.attrs["padding"] = Padding.SAME # doesn't really matter as the kernel is 1x1
Tim Hall3c5cfe92022-03-16 16:31:57 +0000393 op.ifm_resampling_mode = resampling_mode.NEAREST
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200394
395 upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())
Tim Hall47c76362022-07-18 21:26:47 +0100396
397 # Get upscale factor that was calculated in the supported operators check
398 upscale_factor = op.attrs["upscale_factor"]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200399
Rickard Boline546def2022-01-25 15:45:00 +0000400 # Calculate how many times 2x2 upscaling needs to be performed
Tim Hallf9267da2022-04-20 20:19:48 +0100401 # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
402 # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
Rickard Boline546def2022-01-25 15:45:00 +0000403 n = int(np.log2(upscale_factor))
404
Tim Hall885033b2022-07-21 11:46:03 +0100405 # Perform x2 upscaling n-1 times
Rickard Boline546def2022-01-25 15:45:00 +0000406 scaled_op = pre_op
407 for count in range(n - 1):
408 if count > 0:
409 scaled_op = op.clone(f"_{count}")
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200410 scaled_op.inputs[0] = pre_op.outputs[0]
411
Tim Hall885033b2022-07-21 11:46:03 +0100412 # Nearest neighbor x2 upscaling
Tim Hall47c76362022-07-18 21:26:47 +0100413 upscaled_shape = upscaled_shape * 2
Rickard Boline546def2022-01-25 15:45:00 +0000414 shape = op.ofm_shapes[0].as_list()
415 shape[1:3] = upscaled_shape
416 out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
417 out_tens.quantization = op.outputs[0].quantization.clone()
418 scaled_op.set_output_tensor(out_tens)
419 pre_op = scaled_op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200420
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200421 scaled_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000422 DebugDatabase.add_optimised(op, scaled_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200423
Tim Hall885033b2022-07-21 11:46:03 +0100424 # Last x2 upscaling
Rickard Boline546def2022-01-25 15:45:00 +0000425 if n > 1:
426 scaled_op = op.clone(f"_{n-1}")
427 scaled_op.inputs[0] = pre_op.outputs[0]
Tim Hall885033b2022-07-21 11:46:03 +0100428
429 if scaled_op.original_type == Op.ResizeBilinear:
430 if scaled_op.attrs["align_corners"]:
431 # no padding
432 scaled_op.attrs["padding"] = Padding.VALID
433 else:
434 # padding to the right and bottom (limits average pool to 8x8 kernel)
435 scaled_op.attrs["padding"] = Padding.EXPLICIT
436 scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]
437
438 # kernal size dependent on the upscaling factor
439 scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
440 else: # Op.ResizeNearestNeighbor
441 if scaled_op.attrs["align_corners"]:
442 # use depthwise conv to select the correct value
443 scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
444 else:
Johan Alfvéna64616c2022-10-17 12:29:12 +0200445 # Keep 1x1 kernel and average pool, this applies both when
446 # half-pixel-centers is True and False. Calculations are the
447 # same in the reference.
Tim Hall885033b2022-07-21 11:46:03 +0100448 pass
449
Rickard Boline546def2022-01-25 15:45:00 +0000450 scaled_op.outputs = outputs
451 scaled_op.outputs[0].ops = [scaled_op]
452 scaled_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000453 DebugDatabase.add_optimised(op, scaled_op)
Rickard Boline546def2022-01-25 15:45:00 +0000454
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200455 return op
456
457
Raul Farkas66207142023-05-25 11:15:20 +0100458def convert_argmax_to_depthwise_conv_and_max_pool(op: Operation, arch, nng) -> Operation:
Rickard Bolin6986a072022-12-19 12:33:40 +0000459 """
460 Convert ArgMax to DWConv2D->MaxPool->DWConv2D, see details below.
461
462 Example:
463 arr = [4, [00000100,
464 6, = 00000110, # <-- This is the largest value, so we're expecting argmax(arr) = 1
465 5] 00000101]
466
467 Use 16-bit precision and shift all values 7 bits to the left:
468 Shifted_arr = [0000001000000000,
469 0000001100000000,
470 0000001010000000]
471
472 Add "c - index of channel" to each channel:
473 Shifted_arr_plus_reverse_idx = [0000001000000010, (+2)
474 0000001100000001, (+1)
475 0000001010000000] (+0)
476
477 The index is reversed since ArgMax selects the lowest index if maximum value is found at two index. The index will
478 act as a tie-breaker between channels with equal values and since we want the smallest channel index to be chosen
479 we reverse the index before the maxpool and then subtract the index from the number of channel after the maxpool to
480 get the correct index.
481
482 Find the maximum value in the array:
483 val = max(shifted_arr_plus_reverse_idx) = 0000001100000001
484
485 Subtract the value from the number of channels:
486 shifted_arr_plus_idx = (c-1) - val = 2 - 1 = 1
487
488 Extract the 7 lowest bits using a LUT to cut off the 9 most significant bits:
489 idx = LUT(val) = 0000000000000001 = 1
490 """
491
492 if op.type == Op.ArgMax:
493 ifm, ofm = op.inputs[0], op.outputs[0]
494 identity_quant = QuantizationParameters()
495 identity_quant.zero_point = 0
496 identity_quant.scale_f32 = 1.0
Rickard Bolin6986a072022-12-19 12:33:40 +0000497 # Add last dimension to ofm shape
498 ofm.shape += [1]
499 ofm.ops = []
500
501 # Create 1x1 Depthwise convolution with 2**7 weights for each channel to convert precision to 16 bit and shift
502 # all values 7 bits to the left
503 # Set necessary depthwise attributes
504 dw_op_attrs = {
505 "padding": Padding.VALID,
506 "stride_h": 1,
507 "stride_w": 1,
508 "strides": (1, 1, 1, 1),
509 "depth_multiplier": 1,
510 "channel_multiplier": 1,
511 "dilation_h_factor": 1,
512 "dilation_w_factor": 1,
513 "dilation": (1, 1, 1, 1),
514 "explicit_padding": None,
515 }
Johan Alfvenc1ad80b2023-03-31 10:19:23 +0200516 orig_name = op.name
517 op.name = f"{orig_name}_depthwise_conv_SHL_7"
Rickard Bolin6986a072022-12-19 12:33:40 +0000518 op.type = Op.DepthwiseConv2DBias
519 op.attrs.update(dw_op_attrs)
Johan Alfven56811e62023-03-27 11:33:50 +0200520 n, h, w, c = full_shape(4, ifm.shape, 1)
Rickard Bolin6986a072022-12-19 12:33:40 +0000521 shape = [1, 1, 1, c]
522 kernel = np.dstack([2**7] * c)
523 op.inputs = []
524 op.add_input_tensor(ifm)
525 op.add_input_tensor(
526 create_const_tensor(
527 "weights",
528 shape,
529 DataType.uint8,
530 np.array(kernel).reshape(shape),
531 quantization=identity_quant,
532 ),
533 )
534 # Let the bias for each channel be the "reverse" index of the channel it is in, ie c - channel_idx
535 reverse_idxs = list(reversed(range(c)))
536 bias_tensor = create_const_tensor(op.name + "_bias", [c], DataType.int64, reverse_idxs)
537 op.add_input_tensor(bias_tensor)
538
539 intermediate_tens = Tensor([n, h, w, c], DataType.int16, "int16_and_shifted_7_bits_left")
540 intermediate_tens.quantization = ifm.quantization
541 op.set_output_tensor(intermediate_tens)
542 op.set_ifm_ofm_shapes()
543 orig_ifm_shape = op.ifm_shapes[0]
544 DebugDatabase.add_optimised(op, op)
545
546 # To extract 7 least significant bits and swap reverse index back to real index using a LUT activation, we set
547 # the base value to c-1 and slope to -128. The 16-bit LUT uses a table of 32-bit values where the top 16 bits
548 # represent the slope and bottom 16 bits the base which are used to interpolate the activation value.
549 slope = (-128 & 0xFFFF) << 16 # Top 16 bits of 32 bit LUT table value
550 base = c - 1 # Bottom 16 bits of the LUT table value
551 lut_tensor = create_const_tensor(
552 "maxpool_LUT_extract_7_LSB",
553 [1, 1, 1, 512],
554 DataType.uint32,
555 [slope + base] * 512,
556 TensorPurpose.LUT,
557 )
558
559 # Split large feature maps into smaller chunks since the Depthwise Maxpool height dimension can overflow due to
560 # flattening the ifm to (H*W)xCx1
561 max_height = 2**16 // orig_ifm_shape.width
562 num_full_height_ops = orig_ifm_shape.height // max_height
563 last_op_height = orig_ifm_shape.height - max_height * num_full_height_ops
564 op_heights = [max_height] * num_full_height_ops
565 if last_op_height > 0:
566 op_heights.append(last_op_height)
567
568 # Create maxpool output tensor which is reshaped to 1x(H*W)x1x1. The product H*W might be larger than the
569 # maximum allowed height, but that's handled by reading and writing the data in chunks
570 maxpool_ofm = Tensor([1, orig_ifm_shape.height * orig_ifm_shape.width, 1, 1], DataType.int16, "argmax_maxpool")
571 maxpool_ofm.quantization = identity_quant
572
573 for op_idx, op_height in enumerate(op_heights):
574 maxpool_op = create_depthwise_maxpool(
575 f"dw_maxpool_{op_idx}", intermediate_tens, orig_ifm_shape, identity_quant
576 )
577 maxpool_op.outputs = [maxpool_ofm]
578 maxpool_ofm.ops.append(maxpool_op)
579 maxpool_op.ofm_shapes = [Shape4D(maxpool_ofm.shape)]
580 maxpool_op.set_activation_lut(lut_tensor)
581
582 # Set read and write shapes/offsets to read/write chunks of the IFM/OFM
583 maxpool_op.read_shapes[0] = Shape4D([1, op_height * orig_ifm_shape.width, orig_ifm_shape.depth, 1])
584 maxpool_op.read_offsets[0] = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
585 maxpool_op.write_shape = Shape4D([1, op_height * orig_ifm_shape.width, 1, 1])
586 maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
587 DebugDatabase.add_optimised(op, maxpool_op)
588
Johan Alfvenc1ad80b2023-03-31 10:19:23 +0200589 # Set final shape
590 maxpool_ofm.set_all_shapes([1, h, w, 1])
591
592 # Convert 16bit to 32bit or 64bit
593 if ofm.dtype == DataType.int64:
594 # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
595 #
596 # A -> B -> C -> D (OFM)
597 # |0001| |00010000| |0001|0000| |00010000|00000000|
598 # i16 i32 i16 i16 i32 i32
599 # <-------i64------->
600 #
601 # Memcpy is used to copy the content from B to C and from D to OFM
602 # Memcpy will be turned into a nop or an DMA transer if memory regions differs.
603 intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
604 else:
605 intermediate_32bit = ofm
606
607 op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
608 DebugDatabase.add_optimised(op, op_cast)
609
610 if ofm.dtype == DataType.int64:
611 # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
612 intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
613 memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
614 DebugDatabase.add_optimised(op, memcpy_op)
615
616 # Create int32 tensor with double ofm shape to be able to store a "int64" result
617 intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")
618
619 op_cast = create_cast_op(
620 f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
621 )
622 DebugDatabase.add_optimised(op, op_cast)
623
624 memcpy_op = create_memcpy("f{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
625 DebugDatabase.add_optimised(op, memcpy_op)
Rickard Bolin6986a072022-12-19 12:33:40 +0000626
627 return op
628
629
Rickard Bolinfea15162022-07-04 16:19:16 +0000630def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
631 def _compute_interpolation_values(index, input_size, output_size):
632 scale = input_size / output_size
633 scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
634 lower_bound = max(np.floor(scaled_value), 0)
635
636 return scaled_value, lower_bound
637
638 def _compute_kernels(input_height, input_width, output_height, output_width):
639 kernels = []
640 for y in (1, 2):
641 for x in (1, 2):
642 sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
643 sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)
644
645 # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
646 # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
647 # top-to-bottom - same as the depthwise convolution strides across each tile
648 kernel = np.zeros((2, 2))
649 kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
650 kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
651 kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
652 kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
653 kernel *= 16
654 kernels.append(kernel)
655
656 return kernels
657
658 def _build_convolutions(op, kernels):
659 dw_op_attrs = {
660 "padding": Padding.TILE,
661 "stride_h": 1,
662 "stride_w": 1,
663 "strides": (1, 1, 1, 1),
664 "depth_multiplier": 1,
665 "channel_multiplier": 1,
666 "dilation_h_factor": 1,
667 "dilation_w_factor": 1,
668 "dilation": (1, 1, 1, 1),
669 }
670 ifm = op.ifm
671 ofm = op.ofm
672 ofm.ops = []
673 elem_size = 2 if ofm.dtype == DataType.int16 else 1
674
675 n, h, w, c = ifm.shape
676 _, _, ow, _ = ofm.shape
677
678 intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
679 intermediate_tens.quantization = op.outputs[0].quantization.clone()
680 avgpool_op = op
681 avgpool_op.name = "rb_init_avgpool"
682 avgpool_op.type = Op.AvgPool
683 avgpool_op.attrs["padding"] = Padding.VALID
684 avgpool_op.attrs["stride_w"] = 1
685 avgpool_op.attrs["stride_h"] = 1
686 avgpool_op.attrs["filter_width"] = 1
687 avgpool_op.attrs["filter_height"] = 1
688 avgpool_op.attrs["strides"] = [1, 1, 1, 1]
689 avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
690
691 avgpool_op.add_input_tensor(ifm)
692 avgpool_op.set_output_tensor(intermediate_tens)
693 avgpool_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000694 DebugDatabase.add_optimised(op, op)
Rickard Bolinfea15162022-07-04 16:19:16 +0000695
696 dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
697 dw_conv._original_type = Op.ResizeBilinear
698 dw_conv.write_shape = Shape4D(n, h, w, c)
699 dw_conv.write_offset = Shape4D(0, 0, 0, 0)
700
Tim Hall5ff4cd12023-05-16 22:39:14 +0100701 # Resize bilinear requires rounding away from zero
702 dw_conv.rounding_mode = RoundingMode.AwayZero
Rickard Bolinfea15162022-07-04 16:19:16 +0000703
704 # Double height and width stride to write the output of each of the four depthwise convolutions below
705 # interleaved with each other when combined with OFM tile base offsets.
706 dw_conv.ofm_stride_multiplier = [1, 2, 2] # C/H/W
707
708 # Choose tile padding direction - pad by 1 with edge values in two direction.
709 # For example, TL (top left) will pad top and left in H/W-plane in all channels.
710 directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]] # TL, TR, BL, BR
711 for i in (0, 1):
712 for j in (0, 1):
713 index = i * 2 + j
714 dw_conv.name = f"depthwise_conv_{index}"
715 dw_op_attrs["explicit_padding"] = directions[index]
716 dw_conv.attrs.update(dw_op_attrs)
717
718 # This will offset the start of the write by modifying the Tile 0 base address
719 dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size
720
721 ofm.ops.append(dw_conv)
722 dw_conv.outputs = [ofm]
723
724 kernel = kernels[index]
725 shape = [2, 2, 1, c]
726 kernel = np.dstack([kernel] * c)
727
728 quant = QuantizationParameters()
729 quant.zero_point = 0
730 quant.scale_f32 = 1.0 / 16
731
732 dw_conv.inputs = []
733 dw_conv.add_input_tensor(intermediate_tens)
734 dw_conv.add_input_tensor(
735 create_const_tensor(
736 "weights",
737 shape,
738 intermediate_tens.dtype,
739 np.array(kernel).reshape(shape),
Rickard Bolinfea15162022-07-04 16:19:16 +0000740 quantization=quant,
741 ),
742 )
743
744 # setup bias tensor by assign None and then call the fix-up function to create a suitable tensor.
745 # need to append the bias tensor as resize ops only have 2 inputs
746 assert len(dw_conv.inputs) == 2
747 dw_conv.inputs.append(None)
Rickard Bolin017b4cc2022-09-23 10:16:48 +0000748 fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)
Rickard Bolinfea15162022-07-04 16:19:16 +0000749
750 dw_conv.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000751 DebugDatabase.add_optimised(op, dw_conv)
752
Rickard Bolinfea15162022-07-04 16:19:16 +0000753 dw_conv = dw_conv.clone(f"_{index}")
754 return op
755
756 _, input_height, input_width, _ = op.ifm.shape
757 _, output_height, output_width, _ = op.ofm.shape
758
759 kernels = _compute_kernels(input_height, input_width, output_height, output_width)
760 op = _build_convolutions(op, kernels)
761
762 return op
763
764
Raul Farkas66207142023-05-25 11:15:20 +0100765def fixup_resize(op: Operation, arch, nng) -> Operation:
766 """Fixup resize ops to increase support for ResizeNearestNeighbor cases."""
Tim Hall885033b2022-07-21 11:46:03 +0100767 if op.type.is_resize_op() and op.run_on_npu:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200768 if op.ifm_shapes[0] == op.ofm_shapes[0]:
Tim Hall885033b2022-07-21 11:46:03 +0100769 # Bypass the resize op which is essentially a NOP
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200770 op.inputs = op.inputs[:1]
771 op.type = Op.Identity
772 elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
Tim Hall885033b2022-07-21 11:46:03 +0100773 convert_resize_1x1_to_add(op)
Rickard Bolinfea15162022-07-04 16:19:16 +0000774 elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
775 convert_resizebilinear_to_depthwise_convolutions(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200776 else:
Tim Hall885033b2022-07-21 11:46:03 +0100777 convert_resize_to_upscale_and_average_pool(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200778
779 return op
780
781
782def convert_nop_split_to_identity(op, arch, nng):
783 if op.type == Op.Split and op.attrs.get("num_splits") == 1:
784 # the list comprehension should return a list with a single tensor
785 # if it shouldn't, remove_passthrough_tensor will fail appropriately
786 op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
787 op.type = Op.Identity
788 return op
789
790
Raul Farkas66207142023-05-25 11:15:20 +0100791def rewrite_fully_connected_input(op: Operation, arch, nng) -> Operation:
792 """Rewrite FullyConnected shape as 2D to allow it to run on NPU."""
Fredrik Svedberg0ac08042023-04-11 22:35:04 +0200793 # If the operation already have a read shape do not modify
794 # the ifm shape, since that will already be correct
795 if op.type == Op.FullyConnected and not op.read_shapes[0]:
Ayaan Masooda2ec5aa2022-04-21 14:28:03 +0100796 new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
797 assert new_shape is not None, "Tensor can not be reshaped to 2D"
798 op.ifm_shapes[0] = new_shape
Johan Alfvén65835e02022-10-13 10:49:30 +0200799
800 if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
801 # If IFM is batching then also make sure OFM is batching
802 h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
803 op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])
804
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200805 return op
806
807
Raul Farkas66207142023-05-25 11:15:20 +0100808def convert_batched_fc_shape(op: Operation, arch, nng) -> Operation:
809 """Convert batched FullyConnected op shape to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200810 if op.type == Op.FullyConnected:
811 # Check if the first dimension indicates batching
812 if op.ifm_shapes[0].batch > 1:
813 batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
814 n = op.ifm_shapes[0].batch
815 h, w = batching_split.get(n, (1, n))
816 op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])
817
818 # Reshape Weights to be 4D. IO becomes HWIO
819 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +0100820 weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
821 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200822
823 n = op.ofm_shapes[0].batch
824 h, w = batching_split.get(n, (1, n))
825 op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
826 return op
827
828
829def unfuse_activation_function(op):
830 if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
831 act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
832 op.activation = None
833 out_tens = op.outputs[0]
834 intermediate_tens = out_tens.clone("_act_intermediate")
835 act_op.set_output_tensor(out_tens)
836 act_op.add_input_tensor(intermediate_tens)
837 op.set_output_tensor(intermediate_tens)
838 act_op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +0000839 DebugDatabase.add_optimised(op, act_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200840
841
842def rewrite_stridedslice_output(op, arch, nng):
843 if not op.run_on_npu or op.type != Op.StridedSlice:
844 return op
845
846 new_axis_mask = op.attrs["new_axis_mask"]
847 shrink_axis_mask = op.attrs["shrink_axis_mask"]
848
849 if shrink_axis_mask == 0 and new_axis_mask == 0:
850 return op
851
852 axis_4D = [0] * len(op.outputs)
853 for idx, out_tens in enumerate(op.outputs):
854 output_shape = list(out_tens.shape)
855
856 if shrink_axis_mask != 0:
857 n = 0
858 axis = 0
859 while shrink_axis_mask:
860 prev_mask = shrink_axis_mask
861 n += 1
862 shrink_axis_mask &= shrink_axis_mask - 1
863 axis = int(math.log2(prev_mask - shrink_axis_mask))
864 output_shape = output_shape[:axis] + [1] + output_shape[axis:]
865
866 assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
867 op.attrs["shrink_axis_mask"] = 0
868 if axis >= 0:
869 axis_4D[idx] = axis + (4 - len(output_shape))
870 else:
871 axis_4D[idx] = axis
872 op.ofm_shapes[idx] = Shape4D(output_shape)
873
874 elif new_axis_mask != 0:
875 n = 0
876 axis = 0
877 while new_axis_mask:
878 prev_mask = new_axis_mask
879 n += 1
880 new_axis_mask &= new_axis_mask - 1
881 axis = int(math.log2(prev_mask - new_axis_mask))
882 output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
883 new_axis_mask >>= 1
884
885 assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
886 op.attrs["new_axis_mask"] = 0
887 if axis >= 0:
888 axis_4D[idx] = axis + (4 - len(output_shape))
889 else:
890 axis_4D[idx] = axis
891 op.ofm_shapes[idx] = Shape4D(output_shape)
892
893 op.attrs["split_axis_4D"] = axis_4D
894 return op
895
896
897def rewrite_unpack_output(op, arch, nng):
898 tens = op.outputs[0]
899 if op.run_on_npu and op.type == Op.Unpack:
900 # Unpack is also referred to as Unstack
901 axis = int(op.attrs["axis"])
902 if axis < 0: # Convert to positive axis
903 axis = len(op.inputs[0].shape) + 1 + axis
904 op.type = Op.UnpackReshaped
905 desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]
906
907 axis_4D = axis + (4 - len(desired_output_shape))
908 op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)
909
910 for idx, out_tens in enumerate(op.outputs):
911 op.ofm_shapes[idx] = Shape4D(desired_output_shape)
912 return op
913
914
915def add_padding_fields(op, arch, nng):
916 if op.run_on_npu:
917 if "padding" in op.attrs:
918 input_shape = op.ifm_shapes[0]
919 output_shape = op.ofm_shapes[0]
920 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
921 kernel_size = op.inputs[1].shape[:2]
922 elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
923 kernel_size = op.attrs["ksize"][1:3]
924 else:
925 raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")
926
927 if op.type == Op.Conv2DBackpropInputSwitchedBias:
928 upscaling_factor = output_shape.height // input_shape.height
929 padding, skirt = calc_upscaled_padding_and_skirt(
930 op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
931 )
932 else:
933 padding, skirt = calc_padding_and_skirt(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200934 op.attrs["padding"],
935 op.kernel,
936 input_shape,
937 op.attrs.get("explicit_padding"),
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200938 )
939
940 op.attrs["explicit_padding"] = padding
941 op.attrs["skirt"] = skirt
942
943 return op
944
945
Raul Farkas66207142023-05-25 11:15:20 +0100946def reorder_depthwise_weights(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200947 if op.type.is_depthwise_conv2d_op():
948 weight_tensor = op.inputs[1]
Alexander Hansson90c34b52023-05-31 15:03:03 +0000949 if not weight_tensor.weight_transpose_depthwise:
950 weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
951 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
952 weight_tensor.weight_transpose_depthwise = True
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +0200953
954 return op
955
956
Raul Farkas3e7157b2023-05-09 09:09:17 +0100957def convert_avg_pool_to_conv2d(op: Operation, arch, nng) -> Operation:
958 """Convert strided Average Pools with stride >= 4 to Conv2D."""
959 if op.type != Op.AvgPool:
960 return op
961
962 stride_x, stride_y = op.get_kernel_stride()
963 # For strides <= 3 no optimization is needed
964 if stride_x <= 3:
965 return op
966 h, w = op.attrs["filter_height"], op.attrs["filter_width"]
967 inputs = op.inputs[0]
968 shape = inputs.shape
969
970 # Set necessary conv2d attributes
971 op.attrs.update(
972 {
973 "stride_h": stride_y,
974 "stride_w": stride_x,
975 "dilation_h_factor": 1,
976 "dilation_w_factor": 1,
977 "strides": (1, stride_y, stride_x, 1),
978 "dilation": (1, 1, 1, 1),
979 }
980 )
981
982 # Change op type
983 op.type = Op.Conv2DBias
984 op.name += "_conv2d"
985
986 op.rounding_mode = RoundingMode.AwayZero
987 shape = [h, w, 1, op.ofm.shape[-1]]
988 weights = np.full(shape, 1)
989 quant = QuantizationParameters(scale_f32=1 / (h * w), zero_point=0)
990 # Add unit weight tensor
991 op.add_input_tensor(
992 create_const_tensor(
993 "weights",
994 shape,
995 inputs.dtype,
996 weights,
997 quantization=quant,
998 ),
999 )
1000 op.weights.values = np.reshape(op.inputs[1].values, shape)
1001
1002 # Set IFM/OFM shapes after changing op type
1003 op.set_ifm_ofm_shapes()
1004 return op
1005
1006
1007def fixup_strided_conv(op: Operation, arch, nng):
Raul Farkas72c6a242023-03-16 16:38:05 +00001008 """Optimize or fixup strided Conv2DBias
1009 Optimization:
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001010 Reduce, when possible, the Conv2DBias stride from N with 1 > N > 4 to 1
1011 by re-shaping both IFM and filter.
Raul Farkas72c6a242023-03-16 16:38:05 +00001012
1013 Fixup:
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001014 Introduce software support for Conv2DBias with stride_width > 4 by
1015 reducing it to 1, 2 or 3 (HW supported strides) when possible by
1016 re-shaping both IFM and filter.
Raul Farkas72c6a242023-03-16 16:38:05 +00001017 """
Raul Farkas090f18a2023-01-24 16:29:06 +00001018 if op.type != Op.Conv2DBias:
Louis Verhaard43d27582022-03-17 14:06:00 +01001019 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001020 stride_x, stride_y = op.get_kernel_stride()
Louis Verhaard43d27582022-03-17 14:06:00 +01001021 weight_tensor = op.weights
1022 ifm_shape = op.ifm_shapes[0]
Raul Farkas69782af2023-05-09 10:39:52 +01001023
1024 # Do not optimize if op is not the first in the network and stride is
1025 # supported by the hardware
1026 if op.op_index != 0 and stride_x < 4:
1027 return op
1028
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001029 resize_factor, final_stride = calc_resize_factor(ifm_shape.width, stride_x)
1030
1031 def calc_filter_padding(
1032 ifm_padding_type: Padding | None,
1033 ifm_current_padding_x: int,
1034 post_op_stride: int,
1035 opt_resize_factor: int,
1036 filter_width: int,
Raul Farkas3b64f062023-05-16 17:18:31 +01001037 ifm_width: int,
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001038 ) -> tuple[int, int, int, int]:
1039 """Calculate zero padding to be added to the filter.
1040
1041 Parameters
1042 ----------
1043 ifm_padding_type : Padding or None
1044 The padding type that is applied to the IFM.
1045 ifm_current_padding_x : int
1046 Padding amount that is added to the IFM before optimization.
1047 post_op_stride : int
1048 The final stride once optimization is performed.
1049 opt_resize_factor : int
1050 The factor by which the stride will be reduced.
1051 E.g. opt_resize_factor = 2 on a stride of 4 will produce
1052 a stride of 2 after the optimization
1053 filter_width : int
1054 Width of the filter before optimization.
Raul Farkas3b64f062023-05-16 17:18:31 +01001055 ifm_width : int
1056 Width of the IFM before optimization
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001057
1058 Returns
1059 -------
1060 padding : tuple[int, int, int, int]
1061 A tuple with the ammount of padding on each side (top, left, bottom, right)
1062 """
1063 padding_size = 0
1064 padding = (0, 0, 0, 0)
1065 if ifm_padding_type and ifm_padding_type != Padding.VALID:
Raul Farkas3b64f062023-05-16 17:18:31 +01001066 # Compute padding size for the filter that guarantees that HW padding added to IFM matches
1067 # before and after the optimization is performed
1068 expected_filter_size = 0
1069 pre_opt_stride = post_op_stride * opt_resize_factor
1070 post_opt_ifm_width = ifm_width // opt_resize_factor
1071 # Compute the total expected filter size post optimization that ensures that the same HW padding
1072 # is added to IFM.
1073 # There are two ways of calculating required filter size depending on whether IFM width is divisible
1074 # by stride width or not. These approaches match the cases used to calculate HW padding in
1075 # needed_total_padding method.
1076 if ifm_width % pre_opt_stride == 0:
1077 expected_filter_size = ifm_current_padding_x + post_op_stride
1078 else:
1079 expected_filter_size = ifm_current_padding_x + (post_opt_ifm_width % post_op_stride)
1080 # Compute padding size from expected filter size
1081 padding_size = expected_filter_size * opt_resize_factor - filter_width
1082
1083 if ifm_current_padding_x == 0:
1084 # If no HW padding is added to IFM, divide filter padding between left and right following
1085 # the same strategy as the reference.
1086 padding_left = padding_size // 2
1087 else:
1088 # If HW padding is added to IFM, split padding for the filter so that left padding and right padding
1089 # are proportional to left and right HW padding.
1090 left_hw_padding = ifm_current_padding_x // 2
1091 # Compute filter padding
1092 padding_left = padding_size // ifm_current_padding_x * left_hw_padding
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001093 padding = (0, padding_left, 0, padding_size - padding_left)
1094
1095 # Check if filter width is divisible by the stride width (required for optimization)
Raul Farkas3b64f062023-05-16 17:18:31 +01001096 # If filter width is not divisible by stride width and no HW padding is added to IFM, compute
1097 # filter padding required for the filter width to be divisible by the stride width and apply it as right
1098 # padding.
1099 if filter_width % opt_resize_factor != 0 and (padding_size == 0 or ifm_current_padding_x == 0):
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001100 padding_size = opt_resize_factor - (filter_width % opt_resize_factor)
1101 # Add padding zeros to the right
1102 padding = (0, 0, 0, padding_size)
1103
1104 return padding
1105
1106 # Compute the depth of the IFM once the strided Conv2D is optimised
1107 post_opt_ifm_depth = ifm_shape.depth * resize_factor
1108
1109 if stride_x > 1 and (post_opt_ifm_depth <= 8 or stride_x > 3) and resize_factor != 1 and weight_tensor is not None:
1110 k_w, _ = op.get_kernel_size()
1111 weight_shape = weight_tensor.shape
1112
1113 padding_type = op.attrs.get("padding", None)
1114 if padding_type in (None, Padding.EXPLICIT, Padding.TILE):
Louis Verhaard43d27582022-03-17 14:06:00 +01001115 return op
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001116 # Compute current padding as if IFM padding is SAME
1117 curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
1118 # Compute the padding needed on the filter for the optimisation
1119 _, left_filter_padding, _, right_filter_padding = calc_filter_padding(
Raul Farkas3b64f062023-05-16 17:18:31 +01001120 padding_type, curr_padding_x, final_stride, resize_factor, k_w, ifm_shape.width
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001121 )
1122 total_horizontal_padding = left_filter_padding + right_filter_padding
1123 # If IFM padding is enabled, check if pre-opt and post-opt padding is
1124 # the same while taking into consideration the extra filter padding.
1125 if padding_type == Padding.SAME:
1126 optimised_padding_x = needed_total_padding(
1127 ifm_shape.width // resize_factor, final_stride, (k_w + 1 + total_horizontal_padding) // resize_factor
1128 )
1129 if curr_padding_x != optimised_padding_x:
1130 # Horizontal padding would become different after optimisation; this would not work
1131 return op
1132
1133 # Resize IFM
Raul Farkas090f18a2023-01-24 16:29:06 +00001134 op.ifm_shapes[0] = Shape4D(
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001135 [ifm_shape.batch, ifm_shape.height, ifm_shape.width // resize_factor, ifm_shape.depth * resize_factor]
Raul Farkas090f18a2023-01-24 16:29:06 +00001136 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001137
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001138 # Compute list of 0 padding for each dimensions of the filter
1139 filter_dimension_padding = [(0, 0) for _ in weight_tensor.shape]
1140 # Update padding for filter width with computed padding
1141 filter_dimension_padding[1] = (left_filter_padding, right_filter_padding)
1142 # Add padding to the filter
1143 zero_point = weight_tensor.quantization.zero_point
1144 padding_constant = zero_point if np.isscalar(zero_point) else 0
1145 padded_filter_tensor = np.pad(weight_tensor.values, filter_dimension_padding, constant_values=padding_constant)
1146 weight_shape[1] = padded_filter_tensor.shape[1]
1147 weight_tensor.values = padded_filter_tensor
Raul Farkas090f18a2023-01-24 16:29:06 +00001148 # Change weight shape based on stride_x
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001149 weight_shape[1] //= resize_factor
1150 weight_shape[2] *= resize_factor
Raul Farkas090f18a2023-01-24 16:29:06 +00001151
James Peet7519d502021-07-19 16:47:58 +01001152 weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001153 weight_tensor.set_all_shapes(weight_shape)
1154 # If multiple copies of the weights are used, we could avoid
1155 # them having the same address by changing the value_id
1156 weight_tensor.value_id = uuid.uuid4()
1157
1158 # Strides
Raul Farkas10d6b3b2023-01-30 12:58:46 +00001159 stride_x = final_stride
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001160 op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})
1161
1162 return op
1163
1164
Raul Farkas66207142023-05-25 11:15:20 +01001165def convert_conv_to_fc(op: Operation, arch, nng) -> Operation:
1166 """Convert 1x1 Conv2D that behave like FullyConnected to FullyConnected, since they don't need any weight
1167 buffering.
1168 """
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001169 # Conv 1x1 can be equivalent to Fully Connected.
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001170 # (Weights dont need to be reloaded for convs when IFM H and W are 1)
1171 if op.type == Op.Conv2DBias:
1172 h = op.ifm_shapes[0].height
1173 w = op.ifm_shapes[0].width
1174 kh, kw, _, _ = op.inputs[1].shape
1175 if h == 1 and w == 1 and kh == 1 and kw == 1:
1176 # Overwrite this op as a Fully Connected Op
1177 op.name += "_fc"
1178 op.type = Op.FullyConnected
1179 op.attrs = {
1180 "weights_format": 0,
1181 }
1182 # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
1183 weight_tensor = op.inputs[1]
James Peet7519d502021-07-19 16:47:58 +01001184 weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
1185 weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001186
1187 DebugDatabase.add_optimised(op, op)
1188 return op
1189
1190
Raul Farkas66207142023-05-25 11:15:20 +01001191def fixup_relus_with_differing_ifm_ofm_scaling(op: Operation, arch, nng) -> Operation:
1192 """Fixup Relu with different IFM and OFM to allow fusing by adding its own primary op."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001193 if op.run_on_npu and op.type.is_relu_op():
1194 ifm = op.inputs[0]
1195 ofm = op.outputs[0]
1196 # Relu with differing IFM and OFM scaling cannot be fused with another primary op
1197 # and requires its own to be inserted
1198 if not check_quantized_tens_scaling_equal(ifm, ofm):
1199 # Override this op with its own primary op (avgpool)
1200 relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
1201 # And fuse the original activation function to it
1202 relu_fused_op.activation = create_activation_function(op.type)
Fredrik Svedberg1a7527c2021-09-13 15:52:16 +02001203 # Add explicit rescaling
1204 rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
1205 multiplier, shift = scaling.quantise_scale(rescale)
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001206 relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001207 # Tidy up and assign the ifm and ofm to the new op
1208 ifm.consumer_list.remove(op)
1209
1210 relu_fused_op.add_input_tensor(ifm)
1211 relu_fused_op.set_output_tensor(ofm)
1212 relu_fused_op.set_ifm_ofm_shapes()
1213 op = relu_fused_op
1214 return op
1215
1216
Raul Farkas66207142023-05-25 11:15:20 +01001217def convert_lstm(op: Operation, arch, nng) -> Operation:
1218 """Convert LSTM op into its basic opearations to allow for support on NPU."""
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02001219 if op.type == Op.UnidirectionalSequenceLstm:
1220 lstm = Lstm(op)
1221 op = lstm.get_graph()
1222 return op
1223
1224
Raul Farkas66207142023-05-25 11:15:20 +01001225def convert_softmax(op: Operation, arch, nng) -> Operation:
1226 """Convert Softmax op into its basic operations to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001227 if op.type == Op.Softmax and op.run_on_npu:
1228 softmax = SoftMax(op)
1229 op = softmax.get_graph()
1230 return op
1231
1232
Raul Farkas66207142023-05-25 11:15:20 +01001233def convert_prelu(op: Operation, arch, nng) -> Operation:
1234 """Convert PReLU op to other ops based on alpha values to allow for support on NPU."""
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001235 if op.type == Op.Prelu:
1236 ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
1237 if None in (ifm, alpha, ofm):
1238 return op
1239
Fredrik Svedberg66591652022-08-29 10:51:27 +02001240 if alpha.values is not None:
1241 # If const alpha check for possible optimisations
1242 alpha_zp = alpha.quantization.zero_point
1243 alpha_scale = alpha.quantization.scale_f32
1244 # If all alpha values are the same the PReLU can be converted to LeakyRelu
Rickard Bolin5fdcf172022-12-19 12:56:17 +00001245 alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
1246 alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
Fredrik Svedberg66591652022-08-29 10:51:27 +02001247 if alpha_min == alpha_max:
1248 # or even a Relu
1249 if alpha_min == 0:
1250 new_op = Op.Relu
1251 else:
1252 new_op = Op.LeakyRelu
1253 op.attrs["alpha"] = alpha_min
1254 # setup alpha_scaling for bit exact result
1255 ifm_scale = ifm.quantization.scale_f32
1256 ofm_scale = ofm.quantization.scale_f32
1257 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
1258 op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
1259 # Change op type
1260 op.type = new_op
1261 op.name = op.name.replace("Prelu", new_op.name)
1262 del op.inputs[1] # Remove alpha tensor
1263 return op
1264 elif alpha_max < 1:
1265 # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
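# (this works because for x >= 0, max(alpha * x, x) == x when alpha < 1, and for x < 0 it
# selects alpha * x, which matches the PReLU definition)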
1266 # Multiply with alpha tensor
1267 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1268 mul_alpha.add_input_tensor(ifm)
1269 mul_alpha.add_input_tensor(alpha)
1270 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1271 mul_alpha.set_output_tensor(fm_alpha)
1272 mul_alpha.set_ifm_ofm_shapes()
1273 DebugDatabase.add_optimised(op, mul_alpha)
1274 if check_quantized_tens_scaling_equal(ifm, ofm):
1275 # No scaling is needed
1276 fm_id = ifm
1277 else:
1278 # Add multiplication with identity
1279 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1280 mul_identity.add_input_tensor(ifm)
1281 # Create const tensor containing identity as scalar
1282 quantization = ifm.quantization.clone()
1283 quantization.scale_f32 = np.float32(1)
1284 quantization.zero_point = 0
1285 one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
1286 mul_identity.add_input_tensor(one)
1287 # Make sure that fm_id is allocated to a different address than fm_alpha
1288 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1289 mul_identity.set_output_tensor(fm_id)
1290 mul_identity.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001291 DebugDatabase.add_optimised(op, mul_identity)
Fredrik Svedberg66591652022-08-29 10:51:27 +02001292
1293 # Combine scaled and alpha multiplied values
1294 max_op = Operation(Op.Maximum, op.name + "_max")
1295 max_op.add_input_tensor(fm_alpha)
1296 max_op.add_input_tensor(fm_id)
1297 max_op.set_output_tensor(ofm)
1298 max_op.set_ifm_ofm_shapes()
1299
1300 DebugDatabase.add_optimised(op, max_op)
1301 ifm.consumer_list.remove(op)
1302 return max_op
1303
1304 # Catch-all PReLU conversion for the cases that could not be optimised above
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001305 no_scale_quant = ifm.quantization.clone()
1306 no_scale_quant.scale_f32 = None
1307 no_scale_quant.zero_point = 0
Fredrik Svedberg66591652022-08-29 10:51:27 +02001308 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001309
1310 # Select values < 0
1311 min_op = Operation(Op.Minimum, op.name + "_min")
1312 min_op.add_input_tensor(ifm)
1313 min_op.add_input_tensor(zero)
1314 fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
1315 min_op.set_output_tensor(fm_negative)
1316 min_op.set_ifm_ofm_shapes()
1317 DebugDatabase.add_optimised(op, min_op)
1318
1319 # and multiply with alpha tensor
1320 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1321 mul_alpha.add_input_tensor(fm_negative)
1322 mul_alpha.add_input_tensor(alpha)
1323 fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
1324 mul_alpha.set_output_tensor(fm_alpha)
1325 mul_alpha.set_ifm_ofm_shapes()
1326 DebugDatabase.add_optimised(op, mul_alpha)
1327
1328 # Select (and scale) values > 0
1329 relu_op = Operation(Op.Relu, op.name + "_relu")
1330 relu_op.add_input_tensor(ifm)
1331 fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1332 relu_op.set_output_tensor(fm_scaled)
1333 relu_op.set_ifm_ofm_shapes()
1334 DebugDatabase.add_optimised(op, relu_op)
1335
1336 # Add scaled and alpha multiplied values (without scaling)
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001337 add_op = Operation(Op.Add, op.name + "_add")
1338 add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001339 add_op.add_input_tensor(fm_alpha)
1340 add_op.add_input_tensor(fm_scaled)
1341 add_op.set_output_tensor(ofm)
1342 add_op.set_ifm_ofm_shapes()
1343
1344 DebugDatabase.add_optimised(op, add_op)
1345 ifm.consumer_list.remove(op)
1346 op = add_op
1347
1348 return op
1349
1350
Raul Farkas66207142023-05-25 11:15:20 +01001351def convert_mul_max_to_abs_or_lrelu(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001352 r"""Whenever there is a subgraph with this topology:
1353
Jonas Ohlssond8575072022-03-30 10:30:25 +02001354 Input X For X = -1 or X > 0
1355 | \ / This subgraph can be replaced with either
1356 | Mul an Abs (if X = -1) or a LeakyReLU (if X > 0)
1357 | /
1358 Max
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001359 """
1360
1361 if op.type == Op.Maximum:
1362 # finds the Mul input(s) to the Max
1363 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
1364 if len(muls) == 1:
1365 mul = muls[0].ops[0]
1366 elif len(muls) == 2:
1367 # In the case both inputs are Muls, find the one with the same input as the Max
Fredrik Svedberg66591652022-08-29 10:51:27 +02001368 mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
1369 if len(mul_ifms):
1370 mul = mul_ifms[0].ops[0]
1371 else:
1372 # Not using same input
1373 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001374 else:
1375 # No Mul inputs
1376 return op
1377
1378 # make sure the Mul doesn't have any other consumers
1379 mul_ofm = mul.outputs[0]
1380 if len(mul_ofm.consumers()) != 1:
1381 return op
1382 # make sure the Mul doesn't have a fused activation function
1383 if mul.activation:
1384 return op
1385 ifm, ofm = op.get_ifm_ofm()
1386 if ifm is None or ofm is None:
1387 return op
1388
1389 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1390 return op
1391 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
1392 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
1393 return op
1394
1395 # finds the branched input that goes to both the Max and the Mul
1396 shared = set(op.inputs) & set(mul.inputs)
1397 if len(shared) == 1:
1398 shared_in = shared.pop()
1399 # find the constant scalar input to the Mul
1400 const_tens = (set(mul.inputs) - {shared_in}).pop()
1401 # check that it is a scalar
1402 if const_tens.shape != []:
1403 return op
1404 const = const_tens.ops[0]
1405 # check that it is a constant
1406 if const.type != Op.Const:
1407 return op
1408 # Remove the Mul from the shared input's consumers
1409 shared_in.consumer_list.remove(mul)
1410 else:
1411 return op
1412
1413 val = const.outputs[0].values
1414 if val >= 0:
1415 new_op = Op.LeakyRelu
1416 op.attrs["alpha"] = val
1417 # to produce bit exact results, the alpha is not enough;
1418 # save additional scaling info in attr "alpha_scaling", to be used as input
1419 # to the LUT construction
James Peet7519d502021-07-19 16:47:58 +01001420 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001421 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
1422 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
1423 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
1424 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
1425 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
1426 elif val == -1:
1427 new_op = Op.Abs
1428 else:
1429 return op
1430
1431 op.type = new_op
1432 op.name = op.name.replace("Maximum", new_op.name)
1433 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
1434 op.inputs = [shared_in]
1435 op.set_ifm_ofm_shapes()
1436
1437 # Record optimisation in debug database
1438 DebugDatabase.add_optimised(op, op)
1439
1440 return op
1441
1442
Raul Farkas66207142023-05-25 11:15:20 +01001443def convert_hardswish_to_lut(op: Operation, arch, nng) -> Operation:
1444 """Convert HardSwish to LUT to allow for support on NPU."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001445 if op.type == Op.HardSwish:
1446 ifm, ofm = op.get_ifm_ofm()
1447 # Generate the LUT
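# Reference definition: hardswish(x) = x * relu6(x + 3) / 6. The loop below evaluates this in
# 16-bit fixed point for every possible quantised input value and stores the results in a LUT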
1448 ifm_scale = np.double(ifm.quantization.scale_f32)
1449 ofm_scale = np.double(ofm.quantization.scale_f32)
1450 zp_in = ifm.quantization.zero_point
1451 zp_out = ofm.quantization.zero_point
1452 ifm_scale_hires = (1 / 128) * ifm_scale
1453 relu_multiplier = np.double(3 / 32768)
1454 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1455 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1456 # Use 16bit scale
1457 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1458 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1459
1460 values = []
1461 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1462 quantized_min = min(ix)
1463 quantized_max = max(ix)
1464 for x in ix:
1465 input_value = x - zp_in
1466 input_value_hires = input_value * 128
1467 # Compute the input value on essentially the output scale, not shifted yet
1468 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1469 # Compute the "relu-ish multiplier". This matches the code in TensorFlow Lite Micro kernel
1470 relu_value = np.int16(input_value_hires)
1471 if relu_shift < 31:
1472 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1473
1474 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1475
1476 if relu_shift < 31:
1477 relu_value = fp_math.shift_left16(relu_value, 1)
1478
1479 if relu_shift > 31:
1480 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1481
1482 # Rescaled the value into a 16bit fixedpoint relu_value in [-1, 1]
1483 # Now convert that to a 16bit fixedpoint value in [0, 1]
1484 relu_value = (relu_value + (1 << 15)) >> 1
1485 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1486 shift = 31 - out_shift
1487 shift = -shift if shift < 0 else 0
1488 # Finally apply the output shift
1489 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1490 lut_result = min(quantized_max, max(quantized_min, lut_result))
1491 values.append(lut_result)
1492 return convert_to_lut(op, values, "hardswish")
1493 return op
1494
1495
1496def convert_lrelu_to_mul_max(op, arch):
1497 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1498 # (the opposite of convert_mul_max_to_abs_or_lrelu)
1499 ifm, ofm = op.get_ifm_ofm()
1500 if ifm is None or ofm is None:
1501 return op
1502
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001503 alpha = np.float32(op.attrs["alpha"])
1504 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001505 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001506 if use_mul_max:
1507 mul_ifm = ifm
1508 new_op = Op.Maximum
1509 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001510 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001511 no_scale_quant = ifm.quantization.clone()
1512 no_scale_quant.scale_f32 = None
1513 no_scale_quant.zero_point = 0
1514 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1515
1516 # Select values < 0
1517 min_op = Operation(Op.Minimum, op.name + "_min")
1518 min_op.add_input_tensor(ifm)
1519 min_op.add_input_tensor(zero)
1520 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001521 if alpha < 0 and not is_converted_prelu:
1522 # For negative alpha that is not from a converted PReLU we need to use
1523 # int32 Mul below to perform the (negative) alpha scaling
1524 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001525 min_op.set_output_tensor(mul_ifm)
1526 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001527 new_op = Op.Add
1528 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001529 DebugDatabase.add_optimised(op, min_op)
1530
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001531 # Add multiplication with alpha
1532 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001533 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001534 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001535 quantization = ifm.quantization.clone()
1536 quantization.min = 0
1537 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1538 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001539 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001540 if is_converted_prelu:
1541 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001542 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001543 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001544 elif alpha == 0 or np.isinf(1 / alpha):
1545 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001546 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001547 scalar = 0
1548 else:
1549 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001550 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001551 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001552 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1553 else:
1554 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001555 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001556 mul_alpha.add_input_tensor(alpha_tens)
1557 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1558 mul_alpha.set_output_tensor(fm_alpha)
1559 mul_alpha.set_ifm_ofm_shapes()
1560 DebugDatabase.add_optimised(op, mul_alpha)
1561
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001562 if not use_mul_max:
1563 relu_op = Operation(Op.Relu, op.name + "_relu")
1564 relu_op.add_input_tensor(ifm)
1565 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1566 relu_op.set_output_tensor(fm_id)
1567 relu_op.set_ifm_ofm_shapes()
1568 DebugDatabase.add_optimised(op, relu_op)
1569 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001570 # No identity multiplication is needed
1571 fm_id = ifm
1572 else:
1573 # Add multiplication with identity
1574 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1575 mul_identity.add_input_tensor(ifm)
1576 # Create const tensor containing identity as scalar
1577 quantization = ifm.quantization.clone()
1578 quantization.min = 0
1579 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001580 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001581 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001582 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001583 mul_identity.add_input_tensor(identity_tens)
1584 # Make sure that fm_id is allocated to a different address than fm_alpha
1585 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1586 mul_identity.set_output_tensor(fm_id)
1587 mul_identity.set_ifm_ofm_shapes()
1588 DebugDatabase.add_optimised(op, mul_identity)
1589
1590 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001591 op.type = new_op
1592 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001593 op.inputs = []
1594 ifm.consumer_list.remove(op)
1595 op.add_input_tensor(fm_alpha)
1596 op.add_input_tensor(fm_id)
1597 op.set_ifm_ofm_shapes()
1598
1599 DebugDatabase.add_optimised(op, op)
1600 return op
1601
1602
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001603def convert_to_lut8(op, fn, fn_name):
1604 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1605 # fn is a function(real) -> real
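# e.g. (illustrative values) for an int8 Tanh with ifm_scale 0.1, zp_in 0, ofm_scale 1/128 and
# zp_out 0, the LUT entry for x = 10 is round_away_zero(math.tanh(1.0) * 128) = 97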
1606 ifm, ofm = op.get_ifm_ofm()
1607 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1608 return op
1609 # Generate the LUT
1610 ifm_scale = np.double(ifm.quantization.scale_f32)
1611 ofm_scale = np.double(ofm.quantization.scale_f32)
1612 zp_in = ifm.quantization.zero_point
1613 zp_out = ofm.quantization.zero_point
1614 values = []
1615 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1616 quantized_min = min(ix)
1617 quantized_max = max(ix)
1618 for x in ix:
1619 x_real = ifm_scale * (x - zp_in)
1620 y_real = fn(x_real)
1621 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1622 lut_result = min(quantized_max, max(quantized_min, lut_result))
1623 values.append(lut_result)
1624 return convert_to_lut(op, values, fn_name)
1625
1626
1627def convert_lrelu_to_lut(op, arch):
1628 ifm, ofm = op.get_ifm_ofm()
1629 # Generate the LUT
1630 alpha = op.attrs["alpha"]
1631 ifm_scale = np.double(ifm.quantization.scale_f32)
1632 ofm_scale = np.double(ofm.quantization.scale_f32)
1633 zp_in = ifm.quantization.zero_point
1634 zp_out = ofm.quantization.zero_point
1635 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1636 alpha_scalar = 1
1637 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1638 if "alpha_scaling" in op.attrs:
1639 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
1640 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1641 values = []
1642 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1643 quantized_min = min(ix)
1644 quantized_max = max(ix)
1645 for x in ix:
1646 if x < zp_in:
1647 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1648 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1649 )
1650 else:
1651 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1652 lut_result = min(quantized_max, max(quantized_min, lut_result))
1653 values.append(lut_result)
1654 return convert_to_lut(op, values, "lrelu")
1655
1656
Raul Farkas66207142023-05-25 11:15:20 +01001657def convert_lrelu(op: Operation, arch, nng) -> Operation:
1658 """Convert LeakyRelu to a LUT based solution if possible, otherwise a mul + max."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001659 if op.type != Op.LeakyRelu:
1660 return op
1661 ifm, ofm = op.get_ifm_ofm()
1662 if ifm is None or ofm is None:
1663 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001664 alpha = op.attrs["alpha"]
1665 if alpha == 0:
1666 # When alpha is 0 the operation can be converted to a ReLU
1667 op.type = Op.Relu
1668 op.name = op.name.replace("LeakyRelu", op.type.name)
1669 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001670 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1671 # use LUT for int8/uint8
1672 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001673 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001674 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001675 return op
1676 return convert_lrelu_to_mul_max(op, arch)
1677
1678
Raul Farkas66207142023-05-25 11:15:20 +01001679def convert_tanh_sigmoid_to_lut(op: Operation, arch, nng) -> Operation:
1680 """Convert int8/uint8 Sigmoid and Tanh to a LUT based solution."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001681 if op.type == Op.Sigmoid:
1682 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1683 elif op.type == Op.Tanh:
1684 return convert_to_lut8(op, math.tanh, "tanh")
1685 return op
1686
1687
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001688def fuse_activation_function_with_prev(op, arch, nng):
1689 # if op is a no-op: attempts to move the activation function to the preceding op
1690 if not op.attrs.get("is_nop", False) or op.activation is None:
1691 return op
1692 ifm, ofm = op.get_ifm_ofm()
1693 if ifm is None or ofm is None:
1694 return op
1695 # finds the input(s) to the operation
1696 prev_op = ifm.ops[0]
1697 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1698 fuse = (
1699 prev_op.run_on_npu
1700 and prev_op.type.npu_block_type != NpuBlockType.Default
1701 and len(ifm.ops) == 1
1702 and len(prev_op.outputs[0].consumers()) == 1
1703 and prev_op.activation is None
1704 )
1705 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1706 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1707 # LUT currently only works correctly for elementwise ops
1708 fuse = False
1709 if not fuse:
1710 return op
1711 # Move the fused activation function + corresponding info to prev_op
1712 prev_op.activation = op.activation
1713 prev_op.forced_output_quantization = op.forced_output_quantization
1714 if op.activation_lut is not None:
1715 prev_op.set_activation_lut(op.activation_lut)
1716 # Bypass op
1717 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001718 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001719 return op
1720
1721
1722def _leading_pad_ok(leading_pad, stride, kernel_size):
1723 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1724 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
1725 max_size = kernel_size // 2
1726 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
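# e.g. a 7x7 kernel (max_size 3) with stride 2: a leading pad of 2 or 3 can be replaced by
# hardware padding, but a leading pad of 1 cannot (1 != 3, 3 > 2 and 1 % 2 != 0)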
1727
1728
Raul Farkas66207142023-05-25 11:15:20 +01001729def replace_pad_by_hw_pad(op: Operation, arch, nng) -> Operation:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001730 """
1731 Tries to completely remove a PAD operator by using hardware padding.
1732 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1733 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1734 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1735 if both operations can be run on the NPU.
1736 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1737 """
1738 if (
1739 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001740 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001741 and op.run_on_npu
1742 and op.attrs["padding"] == Padding.VALID
1743 ):
1744 pad_op = op.ifm.ops[0]
1745 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1746 return op
1747 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1748 return op
1749 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1750 k = op.kernel
1751 k_w, k_h = k.dilated_wh()
1752
1753 # Check if the PAD operator can be replaced by hardware padding
1754 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1755 # Too much padding, it would require hardware padding to actually insert zeros
1756 return op
1757 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1758 return op
1759
1760 if op.type.is_avgpool_op():
1761 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1762 for pad, k_size in (
1763 (left, k_w),
1764 (right, k_w),
1765 (top, k_h),
1766 (bottom, k_h),
1767 ):
1768 if pad not in (0, k_size // 2):
1769 return op
1770 # Average pool is converted to depthwise, because NPU average pool + same padding
1771 # has a special implementation that is different from PAD followed by average pool with
1772 # valid padding.
1773 k_w, k_h = op.kernel.width, op.kernel.height
1774 ifm = op.ifm
1775 # Remember other inputs
1776 other_inputs = op.inputs[1:]
1777 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1778 quantization = QuantizationParameters(0.0, 255.0)
1779 quantization.scale_f32 = 1.0 / (k_w * k_h)
1780 quantization.zero_point = 0
1781 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1782 weights = np.full(shape, 1)
1783
1784 weight_tens = create_const_tensor(
1785 op.name + "_weights",
1786 shape,
1787 op.ifm.dtype,
1788 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001789 purpose=TensorPurpose.Weights,
1790 quantization=quantization,
1791 )
James Peet7519d502021-07-19 16:47:58 +01001792 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001793 op.type = Op.DepthwiseConv2DBias
1794 op.inputs = []
1795 op.add_input_tensor(ifm)
1796 op.add_input_tensor(weight_tens)
Tim Hall5ff4cd12023-05-16 22:39:14 +01001797
1798 if op.ifm.dtype == DataType.uint8:
1799 op.rounding_mode = RoundingMode.HalfUp
1800
1801 # Add bias tensor, all biases set to 0
1802 op.inputs.append(None)
1803 fixup_bias_tensors(op, arch, nng, DataType.int32)
1804
1805 else:
1806 op.rounding_mode = RoundingMode.AwayZero
1807
1808 # The DepthwiseConv needs to be performed with the IFM zero point set appropriately so that the correct
1809 # pad values are used. However, in order to use the rounding away from zero mode the zero point needs to
1810 # have been removed so that the zero point is at zero. This is done by adding a kernel sized amount of
1811 # the zero point as a bias. The datatype of the bias needs to be set to int32, even for an int16 IFM,
1812 # because this will cause full precision scaling to be used (see weight compression). Finally, the OFM
1813 # zero point will need forcing to zero (as it has already been removed)
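# e.g. (illustrative values) an int8 IFM with zero point -5 and a 3x3 kernel gives a per-channel
# bias of -5 * 3 * 3 = -45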
1814 nr_biases = op.inputs[1].shape[-1]
1815 bias_values = [op.ifm.quantization.zero_point * k_h * k_w] * nr_biases
1816 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1817 op.add_input_tensor(bias_tensor)
1818
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001819 # Add other inputs
1820 op.inputs.extend(other_inputs)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001821
1822 # Bypass the PAD operator
1823 op.set_input_tensor(pad_op.ifm, 0)
1824 # Adjust the padding attributes of the convolution operator
1825 op.attrs["padding"] = Padding.EXPLICIT
1826 op.attrs["explicit_padding"] = (top, left, bottom, right)
1827 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001828 DebugDatabase.add_optimised(op, op)
1829
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001830 return op
1831
1832
1833def convert_pad(op: Operation, arch, nng):
1834 """
1835 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1836 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1837 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
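E.g. padding (top, left, bottom, right) = (1, 2, 0, 0) becomes one average pool that copies the IFM
into the OFM at offset (1, 2), plus one average pool that fills a one-row band of zero-point values
at the top and one that fills a two-column band at the left.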
1838 """
1839 if op.type != Op.Pad or not op.run_on_npu:
1840 return op
1841 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1842
1843 ifm = op.ifm
1844 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001845 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001846 ofm = op.ofm
1847 assert ofm is not None
1848 ofm.ops = []
1849 ofm_shape = op.ofm_shapes[0]
1850
1851 # Average pool op that copies IFM to the right place inside the OFM
1852 shp0 = Shape4D(0, 0, 0, 0)
1853 shp_top = shp0.with_height(top)
1854 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1855 avgpool_op.activation = op.activation
1856 quant = ofm.quantization
1857 pad_value = quant.zero_point
1858 # Add operations that fill the borders of the OFM
1859 if top > 0:
1860 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1861 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001862 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001863 )
1864 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1865 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1866 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1867 if bottom > 0:
1868 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1869 zero_tens = create_const_tensor(
1870 op.name + "_bottom",
1871 shape.as_list(),
1872 ofm.dtype,
1873 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001874 quantization=quant,
1875 )
1876 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1877 create_avg_pool_for_concat(
1878 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1879 )
1880 if left > 0:
1881 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1882 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001883 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001884 )
1885 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1886 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1887 if right > 0:
1888 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1889 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001890 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001891 )
1892 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1893 create_avg_pool_for_concat(
1894 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1895 )
1896
1897 op.type = Op.ConcatTFLite
1898 return avgpool_op
1899
1900
Raul Farkas66207142023-05-25 11:15:20 +01001901def fixup_bias_tensors(op: Operation, arch, nng, dtype=None) -> Operation:
1902 """Fixup ops that require a bias and don't have one by adding a bias tensor filled with zeros."""
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001903 if op.type.needs_bias() and op.bias is None:
1904 # Op has no bias, add bias tensor filled with zeros
1905 nr_biases = op.inputs[1].shape[-1]
1906 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001907 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1908 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1909 # For int16 the selected bias DataType will have an impact on the scaling
1910 # used when encoding the scales and biases later. The default mode will match the
1911 # reference with reduced scaling for int64 bias.
1912 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1913 # is used to emulate average pool int32 bias should be selected for full precision
1914 # int16 scaling.
1915 if dtype is None:
1916 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1917 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Raul Farkas3e7157b2023-05-09 09:09:17 +01001918 bias_index = op.type.info.indices.biases[0]
1919 if bias_index < len(op.inputs):
1920 op.set_input_tensor(bias_tensor, bias_index)
1921 else:
1922 op.add_input_tensor(bias_tensor)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001923
1924 return op
1925
1926
wilisa0146c94772023-02-08 09:56:14 +00001927def detect_asymmetric_weights(op):
1928 # Check all ops (cpu and npu)
1929 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1930 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001931 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001932 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1933 return True
1934 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001935
wilisa0146c94772023-02-08 09:56:14 +00001936
Raul Farkas66207142023-05-25 11:15:20 +01001937def fixup_asymmetric_weights(op: Operation, arch, nng) -> Operation:
wilisa0146c94772023-02-08 09:56:14 +00001938 if detect_asymmetric_weights(op):
1939 if op.run_on_npu:
1940 print("Zero points have been adjusted.")
1941 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001942 return op
1943
1944
wilisa0146c94772023-02-08 09:56:14 +00001945def check_asymmetric_weights(op, arch, nng):
1946 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1947 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1948 # possibility of other graph optimiser functions modifying the operator (that is later run on the CPU)
1949 if detect_asymmetric_weights(op):
1950 if op.run_on_npu:
1951 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1952 op.run_on_npu = False
1953 return op
1954
1955
1956def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
1957 if force_symmetric_int_weights:
1958 return fixup_asymmetric_weights
1959 else:
1960 return check_asymmetric_weights
1961
1962
Rickard Bolina68b82a2023-04-20 15:12:28 +00001963def convert_mean_to_depthwise_conv(op, arch, nng):
Alexander Hansson90c34b52023-05-31 15:03:03 +00001964 """
1965 When h x w <= 4096 a single DepthwiseConv2DBias is enough (left). When h x w > 4096 there is a need to
1966 split into several ops (right). Do this by splitting up h and changing the read_offset/shape.
1967 Below is an example where ifm is 1x190x64x1
1968 MEAN MEAN
1969 | |-----------------------|----------------------|
1970 DepthwiseConv2DBias 1_DepthwiseConv2DBias 2_DepthwiseConv2DBias 3_DepthwiseConv2DBias
1971 | | | |
1972 MUL |---------ADD-----------| |
1973 | |
1974 |----------------ADD---------------|
1975 |
1976 MUL
1977 1_DepthwiseConv2DBias: read_offset [0, 0, 0, 0]> read_shape [1, 64, 64, 1]>
1978 2_DepthwiseConv2DBias: read_offset [0, 64, 0, 0]> read_shape [1, 64, 64, 1]>
1979 3_DepthwiseConv2DBias: read_offset [0, 128, 0, 0]> read_shape [1, 62, 64, 1]>
1980 """
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001981 if op.type == Op.Mean and op.run_on_npu:
Alexander Hansson90c34b52023-05-31 15:03:03 +00001982 max_kernel_size = 4096
1983 max_height = 64
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001984 inp, axis = op.inputs
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001985 dims = len(inp.shape)
1986 dims_ofm = len(op.ofm.shape)
Alexander Hansson90c34b52023-05-31 15:03:03 +00001987 ofmq = op.ofm.quantization
1988 ifmq = op.ifm.quantization
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001989
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001990 # reduce_axis[i] is true if axis i should be reduced
1991 if axis.shape == []:
1992 reduce_axis = [True if i == axis.values else False for i in range(dims)]
1993 else:
1994 reduce_axis = [True if i in axis.values else False for i in range(dims)]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001995
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001996 ifm_shape = inp.shape.copy()
1997 intermediate_shape = op.ofm.shape.copy()
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001998
Alexander Hansson1d5e8592023-06-27 12:36:25 +00001999 # Fix intermediate_shape when keep_dims is false
2000 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the intermediate_shape should be 1xHx1xC
2001 if dims_ofm < dims:
2002 for i in range(dims):
2003 if reduce_axis[i]:
2004 intermediate_shape.insert(i, 1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002005
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002006 # Reshape to 4D
Alexander Hanssonda8741a2023-06-30 15:41:13 +00002007 reduce_axis = full_shape(4, reduce_axis, False)
2008 ifm_shape = full_shape(4, ifm_shape, 1)
2009 intermediate_shape = full_shape(4, intermediate_shape, 1)
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002010
2011 # If all dimensions to reduce have shape 1, the operation is essentially a memcpy.
2012 # We can then remove the whole op by propagating ofm to previous ops
2013 if not any([reduce_axis[i] and ifm_shape[i] > 1 for i in range(4)]):
2014 op.type = Op.Memcpy
2015 op = bypass_memory_only_ops(op, arch, nng)
2016 return op
2017
Alexander Hanssonda8741a2023-06-30 15:41:13 +00002018 # Support mean over depth-axis by left-shifting the C channel
2019 # From semantics checks we can assume that one of H,W,C has shape 1
2020 if reduce_axis[3] and ifm_shape[3] > 1:
2021 assert 1 in ifm_shape[1:], "Mean reduction over depth channel, but none of H,W,C has shape 1"
2022 # If W=1 reshape NxHx1xC -> NxHxCx1, else reshape Nx1xWxC -> NxWxCx1
2023 idx_to_del = 2 if ifm_shape[2] == 1 else 1
2024
2025 # Delete axis with size 1
2026 del reduce_axis[idx_to_del]
2027 del ifm_shape[idx_to_del]
2028 del intermediate_shape[idx_to_del]
2029
2030 # Add another element to set channel-axis to one
2031 reduce_axis.append(False)
2032 ifm_shape.append(1)
2033 intermediate_shape.append(1)
2034
2035 # Compute kernel sizes for our convolutions
2036 # Batch axis is implicit as it is only supported if batch size is 1.
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002037 h = ifm_shape[1] if reduce_axis[1] else 1
2038 w = ifm_shape[2] if reduce_axis[2] else 1
2039
Alexander Hansson90c34b52023-05-31 15:03:03 +00002040 num_elements_in_axis = h * w
2041
2042 # If one convolution is enough, but height is greater than max kernel height
2043 # reshape from HxW to 1x(HxW)
2044 # This can only be done if the mean is computed over both H and W
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002045 if h > max_height and num_elements_in_axis <= max_kernel_size and reduce_axis[1] and reduce_axis[2]:
2046 ifm_shape = [ifm_shape[0], 1, h * w, ifm_shape[3]]
Alexander Hansson90c34b52023-05-31 15:03:03 +00002047 w = h * w
2048 h = 1
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002049
Alexander Hansson90c34b52023-05-31 15:03:03 +00002050 intermediate_op = None
2051 height_per_conv = min(max_kernel_size // w, h)
2052 height_per_conv = min(height_per_conv, max_height)
2053 num_convs = math.ceil(h / height_per_conv)
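# Using the 1x190x64x1 example from the docstring: height_per_conv = min(4096 // 64, 190, 64) = 64
# and num_convs = ceil(190 / 64) = 3, i.e. three DepthwiseConv2DBias ops reading 64, 64 and 62 rows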
2054 convs = list()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002055
Alexander Hansson90c34b52023-05-31 15:03:03 +00002056 for i in range(num_convs):
2057 is_last_op = i == (num_convs - 1)
2058
2059 intermediate_op = op.clone(f"{op.name}_conv_{i}")
2060
2061 intermediate_op.type = Op.DepthwiseConv2DBias
2062
2063 # Set necessary depthwise attributes
2064 intermediate_op.attrs.update(
2065 {
2066 "padding": Padding.VALID,
2067 "stride_h": 1,
2068 "stride_w": 1,
2069 "strides": (1, 1, 1, 1),
2070 "depth_multiplier": 1,
2071 "channel_multiplier": 1,
2072 "dilation_h_factor": 1,
2073 "dilation_w_factor": 1,
2074 "dilation": (1, 1, 1, 1),
2075 }
2076 )
2077
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002078 b, _, _, c = ifm_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002079
2080 intermediate_tensor = op.ofm.clone(suffix=f"_conv_sum_{i}", set_unique=True)
2081 intermediate_tensor.dtype = DataType.int32
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002082 intermediate_tensor.shape = intermediate_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002083 intermediate_op.set_output_tensor(intermediate_tensor)
2084
2085 # as we have several convs, scaling/rounding must be done after the sum has been calculated
2086 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2087
2088 # compute height for the kernel
2089 if is_last_op and h % height_per_conv != 0:
2090 weight_h = h % height_per_conv
2091 else:
2092 weight_h = height_per_conv
2093
2094 # compute ifm read offset and shape for the convolution
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002095 read_shape_h = weight_h if reduce_axis[1] else ifm_shape[1]
2096 read_shape_w = w if reduce_axis[2] else ifm_shape[2]
Alexander Hansson90c34b52023-05-31 15:03:03 +00002097
2098 intermediate_op.read_offsets[0] = Shape4D([0, i * height_per_conv, 0, 0])
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002099 intermediate_op.read_shapes[0] = Shape4D(ifm_shape).with_hw(read_shape_h, read_shape_w)
Alexander Hansson90c34b52023-05-31 15:03:03 +00002100
2101 weight_quant = QuantizationParameters(0, 255, scale_f32=1.0, zero_point=0)
2102 weight_shape = [weight_h, w, c, b]
2103 weight_tensor = create_const_tensor(
2104 f"{intermediate_op.name}_weights",
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002105 weight_shape,
Alexander Hansson90c34b52023-05-31 15:03:03 +00002106 DataType.uint8,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002107 np.ones(weight_shape),
Alexander Hansson90c34b52023-05-31 15:03:03 +00002108 TensorPurpose.Weights,
2109 quantization=weight_quant,
2110 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002111
Alexander Hansson90c34b52023-05-31 15:03:03 +00002112 weights_1D = np.ones(np.prod(weight_shape))
2113 weight_tensor.equivalence_id = create_equivalence_id(tuple(weights_1D))
2114 weight_tensor.value_id = weight_tensor.equivalence_id
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002115
Alexander Hansson90c34b52023-05-31 15:03:03 +00002116 intermediate_op.set_input_tensor(weight_tensor, 1)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002117
Alexander Hansson90c34b52023-05-31 15:03:03 +00002118 dtype = DataType.int64 if intermediate_op.ifm.dtype == DataType.int16 else DataType.int32
2119 bias_values = [0] * c
2120 bias = create_const_tensor(f"{intermediate_op.name}_bias", [c], dtype, bias_values)
2121 bias.equivalence_id = create_equivalence_id(tuple(bias_values))
2122 bias.value_id = bias.equivalence_id
2123 intermediate_op.inputs.append(bias)
2124 intermediate_op.set_ifm_ofm_shapes()
Johan Alfven7b3008a2023-04-13 18:54:47 +02002125
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002126 # We want to avoid reshaping the ifm tensor directly, to not affect other ops
Alexander Hansson90c34b52023-05-31 15:03:03 +00002127 # so we update the shape explicitly for this operation
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002128 intermediate_op.ifm_shapes[0] = Shape4D(ifm_shape)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002129
Alexander Hansson90c34b52023-05-31 15:03:03 +00002130 convs.append(intermediate_op)
2131 DebugDatabase.add_optimised(op, intermediate_op)
2132
2133 # If we have more than one convolution
2134 # We use add operations to accumulate the intermediate tensors
2135 if len(convs) > 1:
2136 prev_add_op = None
2137 idx = 0
2138
2139 while len(convs):
2140 intermediate_tensor = op.ofm.clone(suffix=f"_add_sum_{idx}", set_unique=True)
2141 intermediate_tensor.dtype = DataType.int32
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002142 intermediate_tensor.shape = intermediate_shape
Alexander Hansson90c34b52023-05-31 15:03:03 +00002143
2144 one_scale_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
2145
2146 ifm = convs.pop().ofm
2147 if not prev_add_op:
2148 ifm2 = convs.pop().ofm
2149 else:
2150 ifm2 = prev_add_op.ofm
Alexander Hansson90c34b52023-05-31 15:03:03 +00002151 intermediate_op = create_add(f"{op.name}_add_{idx}", ifm, ifm2, one_scale_quant)
2152 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2153 intermediate_op.set_output_tensor(intermediate_tensor)
2154 intermediate_op.set_ifm_ofm_shapes()
2155
2156 prev_add_op = intermediate_op
2157 idx += 1
2158
2159 DebugDatabase.add_optimised(op, intermediate_op)
2160
2161 # Convert the original mean op to our final Mul operation
2162 # Which scales and divides by num_elements_in_axis
2163 op.type = Op.Mul
2164 op.name = f"{op.name}_mul"
2165 op.attrs = {}
2166 op.set_input_tensor(intermediate_op.ofm, 0)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002167
Johan Alfven7b3008a2023-04-13 18:54:47 +02002168 # The multiplier is calculated in the same way as in the reference,
2169 # clamping the shift value at the price of some precision loss.
Johan Alfven7b3008a2023-04-13 18:54:47 +02002170 output_multiplier, output_shift_vela = quantise_scale(np.double(ifmq.scale_f32) / np.double(ofmq.scale_f32))
2171
2172 # Convert to reference representation shift value
2173 output_shift = 31 - output_shift_vela
2174
2175 # Reference calculation
2176 # round_down_log2 same as 63 - CountLeadingZeros(num_elements_in_axis)
2177 shift = round_down_log2(num_elements_in_axis)
2178 shift = min(shift, 32)
2179 shift = min(shift, 31 + output_shift)
2180 output_multiplier = (output_multiplier << shift) // num_elements_in_axis
2181 output_shift = output_shift - shift
2182
2183 # Convert to vela representation shift
2184 output_shift_vela = 31 - output_shift
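# e.g. (illustrative) averaging over 190 * 64 = 12160 elements gives shift = round_down_log2(12160) = 13,
# so the multiplier is scaled up by 2**13 before the integer division by 12160 and the reference shift is
# reduced by 13, keeping the combined scale (multiplier / num_elements) in 32-bit fixed point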
2185
2186 # For int32 scaling is not supported so instead multiply with the scale
2187 # intermediate * scale -> round and shift.
Alexander Hansson90c34b52023-05-31 15:03:03 +00002188 identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
Johan Alfven7b3008a2023-04-13 18:54:47 +02002189 scalar = create_const_tensor(
2190 op.name + "_scalar", [1, 1, 1, 1], DataType.int32, [output_multiplier], quantization=identity_quant
2191 )
Alexander Hansson90c34b52023-05-31 15:03:03 +00002192 op.set_input_tensor(scalar, 1)
2193 op.set_ifm_ofm_shapes()
Alexander Hansson1d5e8592023-06-27 12:36:25 +00002194 op.ofm_shapes[0] = Shape4D(intermediate_shape)
Johan Alfven7b3008a2023-04-13 18:54:47 +02002195
2196 # Reference using TFL rounding for the multiply
Alexander Hansson90c34b52023-05-31 15:03:03 +00002197 op.rounding_mode = RoundingMode.TFLite
Johan Alfven7b3008a2023-04-13 18:54:47 +02002198
2199 # Need to use explicit scaling to get the wanted shift
Alexander Hansson90c34b52023-05-31 15:03:03 +00002200 op.explicit_scaling = ExplicitScaling(False, [output_shift_vela], [1])
2201 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002202 return op
2203
2204
Raul Farkas66207142023-05-25 11:15:20 +01002205def convert_ops_to_lut(op: Operation, arch, nng) -> Operation:
2206 """Convert Exp to 8bit or 16bit LUT to allow for support on NPU."""
Johan Alfvence502732023-04-24 13:35:40 +02002207 if op.type == Op.Exp:
2208 if op.ifm.dtype == DataType.int8:
2209 return create_lut_8bit_op(op, math.exp, "exp")
2210 elif op.ifm.dtype == DataType.int16:
2211 return create_lut_int16_op(op, math.exp, "exp")
2212 else:
2213 # Should already be catched in tflite supported ops
2214 assert False, f"Unsupported data type {op.ifm.dtype} for {op.type}"
2215
Johan Alfven8e525ca2023-05-07 13:12:37 +02002216 if op.type == Op.Rsqrt:
2217 return create_lut_rsqrt_int8_op(op)
2218
Johan Alfvence502732023-04-24 13:35:40 +02002219 return op
2220
2221
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002222def optimise_quantize(op: Operation, arch, nng):
2223
2224 if op.type == Op.Quantize and op.run_on_npu:
2225
2226 ifm, ofm = op.get_ifm_ofm()
2227 input_values = ifm.values
2228
2229 # Guard clause - input not const or no values to quantize
2230 if ifm.ops[0].type != Op.Const or input_values is None:
2231 return op
2232
2233 # Singular val in numpy array, convert to indexable array
2234 if input_values.ndim == 0:
2235 input_values = np.array([input_values])
2236
Fredrik Svedberg11563172022-07-06 14:54:12 +02002237 # requantized int8 to int8 or int16 to int16
2238 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002239
2240 # scale needs to use double precision to match TFLite reference kernel
2241 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
2242 effective_multiplier, effective_shift = quantise_scale(effective_scale)
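# e.g. (illustrative values) ifm scale 0.5 and ofm scale 0.25 give an effective scale of 2.0, so an
# input value of 10 (with both zero points at 0) requantises to 20, clamped to the ofm quant range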
2243
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002244 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002245 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002246 input_val = val - ifm.quantization.zero_point
2247
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002248 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
2249 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002250
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002251 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
2252 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002253
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002254 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
2255 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002256
2257 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002258 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002259
2260 quantized_vals = []
2261 for val in input_values:
2262
2263 # Derive quantized value
2264 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002265 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
2266 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002267
2268 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002269 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
2270
2271 # Unsupported data type
2272 else:
2273 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002274
2275 # Make quantize op const and disconnect from parent node
2276
2277 # Remove reference of the current quant op from the parent tensor's consumer list
2278 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2279
2280 # Clear any references to parent node
2281 op.inputs = []
2282
2283 # Convert this quantize op to const
2284 op.type = Op.Const
2285
2286 return op
2287
2288
Ayaan Masood4965fae2022-06-29 11:30:57 +01002289def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
2290 """Static optimisation for SHAPE operator output value known at compile time"""
2291
2292 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
2293
2294 if op.type == Op.Shape and op.run_on_npu:
2295
2296 ifm, ofm = op.get_ifm_ofm()
2297
2298 if len(ifm.shape) != ofm.shape[0]:
2299 return op
2300
2301 # Remove reference of the current shape op from the parent tensor's consumer list
2302 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2303
2304 # Clear any references to parent node
2305 op.inputs = []
2306
2307 # Convert this SHAPE op to const
2308 op.type = Op.Const
2309
2310 # Add size calculation to shape output tensors
2311 ofm.values = np.array(ifm.shape)
2312
2313 return op
2314
2315
Raul Farkas66207142023-05-25 11:15:20 +01002316def fixup_dilation_gt2(op: Operation, arch, nng) -> Operation:
2317 """Fixup Conv2DBias and DepthwiseConv2DBias to allow dilation greater than 2."""
Tim Hallea4ba662022-11-11 18:19:53 +00002318 assert op.run_on_npu
2319 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
2320 dilation_w, dilation_h = op.get_kernel_dilation()
2321
2322 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
2323 # kernel
2324 if dilation_w > 2 or dilation_h > 2:
2325 kernel_w, kernel_h = op.get_kernel_size()
2326 kernel_ic = op.weights.shape[-2]
2327 kernel_oc = op.weights.shape[-1]
2328
2329 # if the dilation is a multiple of 2 then the hardware dialtion can be enabled to provide that multiple
2330 # of 2. this allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
2331 # odd = 1, even = 2
2332 hw_dilation_h = 1 if (dilation_h & 1) else 2
2333 hw_dilation_w = 1 if (dilation_w & 1) else 2
2334
2335 scale_dilation_h = dilation_h // hw_dilation_h
2336 scale_dilation_w = dilation_w // hw_dilation_w
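# e.g. a 3x3 kernel with dilation 4 gives hw_dilation 2 and scale_dilation 2, so the kernel is
# expanded below to (3 - 1) * 2 + 1 = 5 -> a sparse 5x5 kernel which, combined with hardware
# dilation 2, reproduces the original effective dilation of 4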
2337
2338 # create new empty kernel (HWIO format)
2339 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
2340 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
2341
2342 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
2343 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
2344
2345 # copy the original kernel values into the new sparse kernel
2346 for h in range(0, kernel_h):
2347 for w in range(0, kernel_w):
2348 new_h = h * scale_dilation_h
2349 new_w = w * scale_dilation_w
2350 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
2351
2352 # update the weight tensor with the new dilated kernel
2353 op.weights.shape = new_kernel_shape
2354 op.weights.values = new_kernel_values
2355
2356 # enable(=2) / disable(=1) hardware dilation
2357 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
2358 op.attrs["dilation_h_factor"] = hw_dilation_h
2359 op.attrs["dilation_w_factor"] = hw_dilation_w
2360
2361 return op
2362
2363
Tim Hall2180a172023-03-10 18:11:34 +00002364def fixup_reshape(op, arch, nng):
2365 def _get_explicit_shape(implicit_shape, total_size):
2366 # the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to
2367 # the appropriate value
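# e.g. an implicit shape of [4, -1] with total_size 12 gives the explicit shape [4, 3]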
2368 if implicit_shape is None:
2369 return None
2370
2371 explicit_shape = list(implicit_shape)
2372 if -1 in explicit_shape:
2373 explicit_shape[explicit_shape.index(-1)] = int(total_size / abs(np.prod(implicit_shape)))
2374
2375 return explicit_shape
2376
2377 if op.type == Op.Reshape:
2378 ifm_tensor, _, ofm_tensor = op.get_ifm_ifm2_ofm()
2379 ifm_size = ifm_tensor.elements()
2380 ofm_shape = ofm_tensor.shape
2381
2382 new_shape_tensor_shape = op.inputs[1].values.flatten() if len(op.inputs) > 1 else None
2383 new_shape_tensor_shape = _get_explicit_shape(new_shape_tensor_shape, ifm_size)
2384
2385 new_shape_attribute = op.attrs.get("new_shape", None)
2386 new_shape_attribute = _get_explicit_shape(new_shape_attribute, ifm_size)
2387
2388 # if present the new shape tensor overrides the new_shape attribute
2389 if new_shape_tensor_shape is not None:
2390 # check tensor
2391 if not np.array_equal(new_shape_tensor_shape, ofm_shape):
2392 print(
2393 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new shape tensor"
2394 f" ({new_shape_tensor_shape}) that does not match output tensor shape {ofm_shape}. Will use output"
2395 f" tensor shape."
2396 )
2397 elif new_shape_attribute is not None:
2398 # check attribute
2399 if not np.array_equal(new_shape_attribute, ofm_shape):
2400 print(
2401 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new_shape attribute"
2402 f" ({new_shape_attribute}) that does not match output tensor shape {ofm_shape}. Will use output"
2403 f" tensor shape."
2404 )
2405 else:
2406 print(
2407 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' does not have a new shape tensor or a new_shape"
2408 f" attribute. Will use output tensor shape {ofm_shape}."
2409 )
2410
2411 # force new shape tensor to output shape
2412 new_shape_tensor = create_const_tensor(
2413 op.name + "_new_shape", [len(ofm_shape)], DataType.int32, np.array(ofm_shape, np.int32)
2414 )
2415 if len(op.inputs) > 1:
2416 op.set_input_tensor(new_shape_tensor, 1)
2417 else:
2418 op.add_input_tensor(new_shape_tensor)
2419
2420 # force new_shape attribute to output shape
2421 op.attrs["new_shape"] = ofm_shape
2422
2423 return op
2424
2425
Tim Hall9cf63a32023-06-27 12:07:49 +01002426def convert_conv_groups(op: Operation, arch, nng):
2427 """
2428 Convert convolution groups to a split followed by separate convolutions and then a concat.
2429    This needs to run before the concat and split handling functions."""
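    # the rewrite is: ifm -> Split (along depth) -> one Conv2D per group (with sliced weights/bias) -> Concat -> ofm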
2430 if not op.type.is_conv2d_op():
2431 return op
2432
2433 num_conv_groups = op.attrs.get("num_conv_groups", 0)
2434 if num_conv_groups > 1:
2435 # convolution groups params
2436 ifm_depth_cg = op.ifm.shape[-1] // num_conv_groups
2437 num_filters_cg = op.weights.shape[-1] // num_conv_groups
2438
2439 # create split
2440 split_op = Operation(Op.Split, f"{op.name}_split")
2441 split_op.attrs.update(
2442 {
2443 "num_splits": num_conv_groups,
2444 }
2445 )
2446 # first input is the split axis
2447 split_op.add_input_tensor(
2448 # split along the depth axis
2449 create_const_tensor(f"{split_op.name}_axis", [0], DataType.int32, [-1])
2450 )
2451 # second input is the ifm
2452 split_op.add_input_tensor(op.ifm)
2453 # calculate shape of each ofm part
2454 split_op_ofm_shape = op.ifm.shape[:-1] + [ifm_depth_cg]
2455
2456 # create concat. do this prior to each conv group so that the for-loop can reference the concat as it iterates
2457 concat_op = Operation(Op.ConcatTFLite, f"{op.name}_concat")
2458 concat_op.attrs.update(
2459 {
2460 "axis": -1,
2461 "fused_activation_function": None,
2462 }
2463 )
2464 # calculate shape of each ifm part
2465 concat_op_ifm_shape = op.ofm.shape[:-1] + [num_filters_cg]
2466 # output is the concatenated tensor
2467 concat_op.set_output_tensor(op.ofm) # will disconnect ofm from op
2468
2469 # for each conv group
2470 for i in range(num_conv_groups):
2471 # cg params
2472 cg_oc_start = i * num_filters_cg
2473 cg_oc_end = (i + 1) * num_filters_cg
2474
2475 # split has multiple outputs
2476 split_op_ofm_part = Tensor(split_op_ofm_shape, op.ifm.dtype, f"{split_op.name}_out{i}")
2477 split_op_ofm_part.quantization = op.ifm.quantization.clone()
2478 split_op.add_output_tensor(split_op_ofm_part)
2479
2480 # concat has multiple inputs
2481 concat_op_ifm_part = Tensor(concat_op_ifm_shape, op.ifm.dtype, f"{concat_op.name}_in{i}")
2482 concat_op_ifm_part.quantization = op.ofm.quantization.clone()
2483 concat_op.add_input_tensor(concat_op_ifm_part)
2484
2485 # create convolution group operator
2486 conv_group_op = Operation(op.type, f"{op.name}_cg{i}")
2487 conv_group_op.attrs = op.attrs.copy()
2488 conv_group_op.attrs["num_conv_groups"] = 1
2489 # first input is the ifm
2490 conv_group_op.add_input_tensor(split_op_ofm_part)
2491            # second input is weights. the number of filters (i.e. the output channels) needs to be split equally
2492 # across all of the convolution groups
2493 conv_group_op_weights_shape = op.weights.shape[:-1] + [num_filters_cg]
2494 conv_group_op_weights_quant = op.weights.quantization.clone()
2495 conv_group_op_weights_quant.scale_f32 = op.weights.quantization.scale_f32[..., cg_oc_start:cg_oc_end]
2496 conv_group_op_weights_quant.zero_point = op.weights.quantization.zero_point[..., cg_oc_start:cg_oc_end]
2497 conv_group_op.add_input_tensor(
2498 create_const_tensor(
2499 f"{op.weights.name}_cg{i}",
2500 conv_group_op_weights_shape,
2501 op.weights.dtype,
2502 op.weights.values[..., cg_oc_start:cg_oc_end],
2503 op.weights.purpose,
2504 conv_group_op_weights_quant,
2505 )
2506 )
2507 # third input is bias. like the weights, the bias needs to be split equally across all of the convolution
2508 # groups
2509 if op.bias is None:
2510 conv_group_op.add_input_tensor(None)
2511 else:
2512 conv_group_op_bias_shape = op.bias.shape[:-1] + [num_filters_cg]
2513 conv_group_op_bias_quant = op.bias.quantization.clone()
2514 conv_group_op_bias_quant.scale_f32 = op.bias.quantization.scale_f32[..., cg_oc_start:cg_oc_end]
2515 conv_group_op_bias_quant.zero_point = op.bias.quantization.zero_point[..., cg_oc_start:cg_oc_end]
2516 conv_group_op.add_input_tensor(
2517 create_const_tensor(
2518 f"{op.bias.name}_cg{i}",
2519 conv_group_op_bias_shape,
2520 op.bias.dtype,
2521 op.bias.values[..., cg_oc_start:cg_oc_end],
2522 op.bias.purpose,
2523                        conv_group_op_bias_quant,
2524 )
2525 )
2526 # output goes to the concat
2527 conv_group_op.set_output_tensor(concat_op_ifm_part)
2528 # update the cg op shapes and debug db
2529 conv_group_op.set_ifm_ofm_shapes()
2530 DebugDatabase.add_optimised(op, conv_group_op)
2531
2532 # update the split/concat op shapes/debug db
2533 split_op.set_ifm_ofm_shapes()
2534 DebugDatabase.add_optimised(op, split_op)
2535 concat_op.set_ifm_ofm_shapes()
2536 DebugDatabase.add_optimised(op, concat_op)
2537
2538 # disconnect the original convolution operator.
2539 # the ofm has already been disconnected by concat_op.set_output_tensor()
2540 op.ifm.consumer_list.remove(op)
2541 op.inputs = []
2542 op.outputs = []
2543
2544 # return last op so that other graph optimiser functions can process the new operators
2545 op = concat_op
2546
2547 return op
2548
2549
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002550def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02002551 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002552 return op
2553
2554
wilisa0146c94772023-02-08 09:56:14 +00002555def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
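    """Apply the TFLite-specific graph optimisation passes to every subgraph of the network and return the network."""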
Fredrik Svedberg11563172022-07-06 14:54:12 +02002556 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00002557 optimisation_list = [
2558 optimise_quantize,
2559 convert_shape_op_to_constant_tensor,
2560 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
2561 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002562
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002563 for idx, sg in enumerate(nng.subgraphs):
2564 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002565 nng,
2566 sg,
2567 arch,
2568 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01002569 optimisation_list,
2570 rewrite_unsupported=False,
2571 )
2572
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002573 # Pre-processing step
Tim Hall9cf63a32023-06-27 12:07:49 +01002574 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes, fixup_reshape, convert_conv_groups]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002575
Ayaan Masood4965fae2022-06-29 11:30:57 +01002576 for idx, sg in enumerate(nng.subgraphs):
2577 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2578 nng,
2579 sg,
2580 arch,
2581 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002582 pre_process_list,
2583 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002584 )
2585
2586 # Handle Concat Ops
2587 for idx, sg in enumerate(nng.subgraphs):
2588 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
2589 sg.refresh_after_modification()
2590
2591 # Handle Split Ops
2592 for idx, sg in enumerate(nng.subgraphs):
2593 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2594 nng,
2595 sg,
2596 arch,
2597 [],
2598 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
2599 rewrite_unsupported=False,
2600 )
2601
2602 for idx, sg in enumerate(nng.subgraphs):
2603 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002604 nng,
2605 sg,
2606 arch,
2607 [rewrite_split_ops],
2608 [],
2609 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002610 )
2611
Johan Alfvena5e1b622023-02-02 14:59:03 +01002612 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002613 for idx, sg in enumerate(nng.subgraphs):
2614 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002615 nng,
2616 sg,
2617 arch,
2618 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01002619 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002620 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002621 )
2622
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002623 # Rewrite of operators
2624 op_rewrite_list = [
2625 set_tensor_equivalence,
Johan Alfvence502732023-04-24 13:35:40 +02002626 convert_ops_to_lut,
Rickard Bolina68b82a2023-04-20 15:12:28 +00002627 convert_mean_to_depthwise_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002628 convert_depthwise_to_conv,
2629 convert_conv_to_fc,
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02002630 convert_lstm,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002631 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02002632 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02002633 convert_mul_max_to_abs_or_lrelu,
2634 convert_lrelu,
Raul Farkas3e7157b2023-05-09 09:09:17 +01002635 convert_avg_pool_to_conv2d,
Raul Farkas69782af2023-05-09 10:39:52 +01002636 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002637 convert_hardswish_to_lut,
2638 rewrite_fully_connected_input,
2639 convert_batched_fc_shape,
2640 fixup_conv2d_backprop,
2641 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002642 reorder_depthwise_weights,
Rickard Bolin6986a072022-12-19 12:33:40 +00002643 convert_argmax_to_depthwise_conv_and_max_pool,
Tim Hall885033b2022-07-21 11:46:03 +01002644 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002645 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01002646 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002647 convert_tanh_sigmoid_to_lut,
2648 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00002649 fixup_dilation_gt2,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002650 ]
2651
2652 for idx, sg in enumerate(nng.subgraphs):
2653 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002654 nng,
2655 sg,
2656 arch,
2657 [],
2658 op_rewrite_list,
2659 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002660 )
2661
2662 for idx, sg in enumerate(nng.subgraphs):
2663        # remove passthrough tensors and attempt further optimisations
2664 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2665 nng,
2666 sg,
2667 arch,
2668 [remove_passthrough_tensor],
2669 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2670 )
2671
2672    # Removal of SplitSliceRead needs to be done after optimisation has been performed,
2673 # since ifm/ofm_shapes are of importance to this function
2674 for sg in nng.subgraphs:
2675 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2676 sg.refresh_after_modification()
2677
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002678 # Make sure that const optimisations on subgraph outputs are handled correctly
2679 for sg in nng.subgraphs:
2680 for ofm in sg.output_tensors:
2681 if ofm.is_const and ofm.ops[0].type_changed:
2682 # Subgraph output cannot be const - insert a memory copy
2683 op = ofm.ops[0]
2684 ofm_clone = ofm.clone()
2685 ofm_clone.values = ofm.values
2686 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002687 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
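                # the copy is implemented as a NOP elementwise add of zero from the cloned constant tensor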
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002688 memcpy = create_add_nop(f"{ofm.name}_copy")
2689 memcpy.add_input_tensor(ofm_clone)
2690 memcpy.add_input_tensor(zero)
2691 memcpy.set_output_tensor(ofm)
2692 memcpy.set_ifm_ofm_shapes()
2693 op.set_output_tensor(ofm_clone)
2694 DebugDatabase.add_optimised(op, memcpy)
2695
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002696 return nng