# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Early optimisation of a TensorFlow Lite based network graph, using the rewrite_graph module
# to do the traversal of the graph.
from __future__ import annotations

import math
import uuid

import numpy as np

from . import fp_math
from . import rewrite_graph
from . import scaling
from .data_type import BaseType
from .data_type import DataType
from .debug_database import DebugDatabase
from .errors import UnsupportedFeatureError
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .graph_optimiser_util import bypass_memory_only_ops
from .graph_optimiser_util import calc_explicit_padding
from .graph_optimiser_util import convert_depthwise_to_conv
from .graph_optimiser_util import create_avg_pool_for_concat
from .graph_optimiser_util import memory_only_ops
from .graph_optimiser_util import move_splitsliceread_to_consumer
from .graph_optimiser_util import needed_total_padding
from .graph_optimiser_util import set_ifm_ofm_op_shapes
from .graph_optimiser_util import set_tensor_equivalence
from .lstm import Lstm
from .lut import convert_to_lut
from .lut import create_lut_8bit_op
from .lut import create_lut_int16_op
from .lut import create_lut_rsqrt_int8_op
from .numeric_util import clamp_sigmoid
from .numeric_util import full_shape
from .numeric_util import round_away_zero
from .numeric_util import round_down_log2
from .operation import create_activation_function
from .operation import ExplicitScaling
from .operation import NpuBlockType
from .operation import Op
from .operation import Operation
from .operation import Padding
from .operation import RoundingMode
from .operation_util import create_add
from .operation_util import create_add_nop
from .operation_util import create_avgpool_nop
from .operation_util import create_cast_op
from .operation_util import create_depthwise_maxpool
from .operation_util import create_memcpy
from .operation_util import get_pad_values_from_input
from .scaling import quantise_scale
from .shape4d import Shape4D
from .softmax import SoftMax
from .tensor import check_quantized_tens_scaling_equal
from .tensor import create_const_tensor
from .tensor import create_equivalence_id
from .tensor import QuantizationParameters
from .tensor import Tensor
from .tensor import TensorPurpose
from .tflite_mapping import optype_to_builtintype
from .utils import calc_resize_factor

passthrough_nodes = (Op.Identity,)


def remove_passthrough_tensor(tens, arch, nng):
    if len(tens.ops) == 1 and tens.ops[0].type in passthrough_nodes:
        assert len(tens.ops[0].inputs) == 1
        tens = tens.ops[0].inputs[0]
    return tens


def rewrite_concat_ops(op, arch):
    if not op.run_on_npu or not op.type.is_concat_op():
        return

    axis_4D = 0
    ofm = op.ofm
    ofm.ops = []
    offset = 0

    unfuse_activation_function(op)

    if op.type == Op.Pack:
        # Pack is also referred to as Stack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis

        desired_shape = op.inputs[0].shape[:axis] + [1] + op.inputs[0].shape[axis:]
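        # Example: packing three [2, 3] inputs along axis=1 gives an OFM of shape [2, 3, 3];
        # each input is first viewed as desired_shape [2, 1, 3] so the generic concat
        # handling below can write the inputs at consecutive offsets along that axis.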

        axis_4D = axis + (4 - len(desired_shape))

        for idx, inp in enumerate(op.inputs):
            op.ifm_shapes[idx] = Shape4D(desired_shape)
        op.type = Op.PackReshaped

    inputs, axis = op.get_concat_inputs_axis()
    for idx, inp in enumerate(inputs):
        if op.type != Op.PackReshaped:
            op.ifm_shapes[idx] = Shape4D(inp.shape)
            if axis >= 0:
                axis_4D = axis + (4 - len(inp.shape))
            else:
                axis_4D = axis
        write_offset = [0, 0, 0, 0]
        write_offset[axis_4D] = offset
        concat_end = offset + op.ifm_shapes[idx][axis_4D]
        create_avg_pool_for_concat(
            op, op.name + str(idx) + "_avgpool", inp, op.ifm_shapes[idx], Shape4D.from_list(write_offset)
        )
        offset = concat_end
    assert ofm.shape[axis] == offset

    return op


def rewrite_split_ops(tens, arch, nng):

    if len(tens.ops) == 1 and tens.ops[0].type.is_split_op() and tens.ops[0].type != Op.Unpack:
        split_op = tens.ops[0]

        # Not supported so leave it and run on CPU
        if not split_op.run_on_npu:
            return tens

        inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

        tens.ops = []
        new_op = Operation(Op.SplitSliceRead, split_op.name)
        new_op.inputs = [inp]
        ofm_shape_idx = 0
        if None in (offset_end, offset_start):
            read_shape = None
        else:
            # the read shape is relative to each start offset
            read_shape = [oe - os for oe, os in zip(offset_end, offset_start)]

        # For Split the offset cannot be extracted from the tensor so it has to
        # be calculated from the index of the output tensor
        if axis is not None:
            # Get the start and end of the split
            offset_start = [0] * 4
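            # Walk the outputs in order, accumulating each output's size along the split
            # axis until this tensor is reached; e.g. splitting depth 16 into four equal
            # outputs gives read offsets 0, 4, 8 and 12 along the depth axis.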
            axis_4D_list = split_op.attrs.get("split_axis_4D", None)  # Present for UnpackReshaped and some StridedSlice
            for idx, out in enumerate(outputs):
                if axis_4D_list is not None:
                    axis_4D = axis_4D_list[idx]
                else:
                    split_op.ofm_shapes[idx] = Shape4D(out.shape)
                    if axis >= 0:
                        axis_4D = axis + (4 - len(out.shape))
                    else:
                        axis_4D = axis

                if out == tens:
                    ofm_shape_idx = idx
                    read_shape = split_op.ofm_shapes[idx]
                    break

                offset_start[axis_4D] += split_op.ofm_shapes[idx][axis_4D]

        new_op.read_offsets[0] = Shape4D.from_list(offset_start, 0)
        new_op.read_shapes[0] = read_shape
        new_op.run_on_npu = True
        new_op.set_output_tensor(tens)
        new_op.ifm_shapes.append(Shape4D(inp.shape))
        new_op.ofm_shapes.append(split_op.ofm_shapes[ofm_shape_idx])
        DebugDatabase.add_optimised(split_op, new_op)

    return tens


def remove_SplitSliceRead(op, arch):

    if op.type == Op.SplitSliceRead:
        # Check if it is possible to put the SplitSliceRead on the tensor consumer(s),
        # or if an avgpool needs to be inserted
        if op.ofm_shapes[0] == Shape4D.from_list(op.ofm.shape) and all(
            consumer is not None and consumer.run_on_npu and consumer.type not in memory_only_ops
            for consumer in op.ofm.consumer_list
        ):
            # SplitSliceRead can be performed by tensor consumer(s)
            for cons_op in list(op.ofm.consumer_list):
                move_splitsliceread_to_consumer(op, cons_op)
        else:
            avgpool_op = create_avgpool_nop(op.name + "_avgpool")
            avgpool_op.add_input_tensor(op.ifm)
            avgpool_op.outputs = [op.ofm]
            op.ofm.ops.remove(op)
            op.ofm.ops.append(avgpool_op)
            avgpool_op.ifm_shapes.append(op.ifm_shapes[0])
            avgpool_op.ofm_shapes.append(op.ofm_shapes[0])
            avgpool_op.read_offsets[0] = op.read_offsets[0]
            avgpool_op.read_shapes[0] = op.read_shapes[0]

            op.ifm.consumer_list.remove(op)
            DebugDatabase.add_optimised(op, avgpool_op)


def calc_padding_and_skirt(padding_type, kernel, input_shape, explicit_padding):
    k_w, k_h = kernel.dilated_wh()
    s_x, s_y = kernel.stride
    ypad = needed_total_padding(int(input_shape.height), int(s_y), int(k_h))
    xpad = needed_total_padding(int(input_shape.width), int(s_x), int(k_w))
    if padding_type == Padding.SAME:
        left_pad = (xpad + 0) // 2
        right_pad = (xpad + 1) // 2
        top_pad = (ypad + 0) // 2
        bottom_pad = (ypad + 1) // 2
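        # Any extra odd element of SAME padding goes to the right/bottom, e.g. width 224,
        # stride 2, kernel 3 gives xpad = 1, split as left_pad = 0 and right_pad = 1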
    elif padding_type == Padding.VALID:
        left_pad = 0
        right_pad = 0
        top_pad = 0
        bottom_pad = 0
    elif padding_type == Padding.EXPLICIT:
        # Padding is specified in a PAD operator which has been bypassed.
        top, left, bottom, right = explicit_padding
        top_pad, bottom_pad = calc_explicit_padding(int(input_shape.height), int(s_y), int(k_h), int(top), int(bottom))
        left_pad, right_pad = calc_explicit_padding(int(input_shape.width), int(s_x), int(k_w), int(left), int(right))
    elif padding_type == Padding.TILE:
        # The values in the explicit padding only represent the "direction" in which to pad
        top_pad, left_pad, bottom_pad, right_pad = explicit_padding
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt


def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_shape, upscaling_factor):
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == Padding.SAME:
        ypad = needed_total_padding(int(input_shape.height) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_shape.width) * upscaling_factor, int(stride[2]), int(kernel_width))
        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)
    elif padding_type == Padding.VALID:
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        raise UnsupportedFeatureError(f"Unsupported padding = {padding_type} for up-scaled padding calculation")
    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt


def fixup_conv2d_backprop(op, arch, nng):
    if op.type == Op.Conv2DBackpropInput:
        # flip the inputs
        op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0]
        op.type = Op.Conv2DBackpropInputSwitchedBias
        op.ifm_resampling_mode = resampling_mode.TRANSPOSE

        # Update strides
        op.attrs.update({"stride_w": 1, "stride_h": 1, "strides": (1, 1, 1, 1)})
        DebugDatabase.add_optimised(op, op)

    return op


# Convert the op to an elementwise add
def convert_resize_1x1_to_add(op):
    op.type = Op.Add  # original_type will stay as Op.ResizeBilinear or Op.ResizeNearestNeighbor
    op.name = op.name + "_add"
    # Create an input tensor filled with zeros
    name = op.inputs[1].name + "_add"
    dtype = op.inputs[0].dtype
    shape = op.ofm_shapes[0].as_list()
    values = np.zeros(shape, dtype.as_numpy_type())
    quantization = QuantizationParameters(0.0, 255.0)
    quantization.scale_f32 = 1.0
    quantization.zero_point = 0
    op.inputs[1] = op.inputs[0]
    op.set_input_tensor(create_const_tensor(name, shape, dtype, values, quantization=quantization), 0)
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op


# Convert ResizeNearestNeighbor with align corners to a depthwise convolution. The IFM will already have been upscaled
# apart from the final x2 scaling which will be done as part of this operation. The kernel contains a single coefficient
# to select the appropriate nearest neighbor value
def convert_resizenn_ac_to_depthwise_conv(op, upscale_factor):
    ifm = op.ifm
    ofm = op.ofm
    output_depth = ofm.shape[-1]
    dw_op_attrs = {
        "padding": Padding.VALID,
        "stride_h": 1,
        "stride_w": 1,
        "strides": (1, 1, 1, 1),
        "depth_multiplier": 1,
        "channel_multiplier": 1,
        "dilation_h_factor": 1,
        "dilation_w_factor": 1,
        "dilation": (1, 1, 1, 1),
    }

    # change ResizeNearestNeighbor to Depthwise
    op.type = Op.DepthwiseConv2DBias
    op.attrs.update(dw_op_attrs)
    op.set_input_tensor(ifm, 0)  # ifm tensor index
    op.activation = None

    # add input resample to resize by x2
    op.ifm_resampling_mode = resampling_mode.NEAREST

    # don't care about the rounding mode as it is nearest neighbor

    # setup weight tensor
    weight_quant = QuantizationParameters()
    weight_quant.scale_f32 = 1.0  # no scaling as only a single non-zero coeff to select the desired value
    weight_quant.zero_point = 0
    weight_quant.quant_dim = 0
    ofm_dtype = ofm.dtype
    if ofm_dtype.type == BaseType.UnsignedInt:
        weight_quant.quant_min = 0
        weight_quant.quant_max = (1 << ofm_dtype.bits) - 1
    else:
        weight_quant.quant_min = -(1 << (ofm_dtype.bits - 1))
        weight_quant.quant_max = (1 << (ofm_dtype.bits - 1)) - 1

    weight_shape = [upscale_factor, upscale_factor, output_depth, output_depth]  # HWIO

    # the single non-zero coefficient used to select the desired value needs to be placed in the 'centre value', which
    # is calculated by finding the 'centre position' ('*' in the diagram below) and then choosing the 'value' that is
    # below-and-right (i.e. next) to it (D).
    # 0---1---2
    # | A | B |
    # 1---*---+
    # | C | D |
    # 2---+---+
    weight_values = [0] * (upscale_factor * upscale_factor)
    centre_coeff = (upscale_factor // 2) * upscale_factor + (upscale_factor // 2)
    weight_values[centre_coeff] = 1
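    # e.g. upscale_factor 2 gives centre_coeff 3, i.e. the kernel [[0, 0], [0, 1]] that
    # selects value D in the diagram above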

    # add weight tensor, this will discard the size tensor of the resize op
    op.set_input_tensor(
        create_const_tensor(
            "weights",
            weight_shape,
            ofm_dtype,
            np.array(weight_values).reshape(weight_shape),
            quantization=weight_quant,
        ),
        1,  # inputs tensor weight index
    )

    # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
    # need to append the bias tensor as resize ops only have 2 inputs
    assert len(op.inputs) == 2
    op.inputs.append(None)
    fixup_bias_tensors(op, None, None, DataType.int32)

    # finally update the shape in case we've changed the tensor shapes or connections
    op.set_ifm_ofm_shapes()
    DebugDatabase.add_optimised(op, op)

    return op


# Convert ResizeBilinear/NearestNeighbor to a number of 1x1 average pools with nearest neighbor x2 upscaling and one
# final average pool with a kernel size that depends upon the resize op's upscaling factor (x2, x4 or x8). The maximum
# upscale factor is limited to x8 because of the 8x8 kernel size limit for average pool with padding.
def convert_resize_to_upscale_and_average_pool(op):
    pre_op = op
    outputs = op.outputs
    dtype = op.ifm.dtype

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 1, 1, 1)})
    op.attrs["padding"] = Padding.SAME  # doesn't really matter as the kernel is 1x1
    op.ifm_resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.ifm_shapes[0].get_hw_as_list())

    # Get upscale factor that was calculated in the supported operators check
    upscale_factor = op.attrs["upscale_factor"]

    # Calculate how many times 2x2 upscaling needs to be performed
    # Force the result of round to be an integer. This is because the behaviour of rounding numpy.float64 values changed
    # between different versions of numpy. This consistency ensures that the kernel dimensions are kept integral
    n = int(np.log2(upscale_factor))
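    # e.g. upscale_factor 8 gives n = 3: two x2 upscale stages in the loop below plus the
    # final x2 stage with the factor-dependent kernel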

    # Perform x2 upscaling n-1 times
    scaled_op = pre_op
    for count in range(n - 1):
        if count > 0:
            scaled_op = op.clone(f"_{count}")
            scaled_op.inputs[0] = pre_op.outputs[0]

        # Nearest neighbor x2 upscaling
        upscaled_shape = upscaled_shape * 2
        shape = op.ofm_shapes[0].as_list()
        shape[1:3] = upscaled_shape
        out_tens = Tensor(shape, dtype, f"{op.outputs[0].name}_{count}")
        out_tens.quantization = op.outputs[0].quantization.clone()
        scaled_op.set_output_tensor(out_tens)
        pre_op = scaled_op

        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    # Last x2 upscaling
    if n > 1:
        scaled_op = op.clone(f"_{n-1}")
        scaled_op.inputs[0] = pre_op.outputs[0]

        if scaled_op.original_type == Op.ResizeBilinear:
            if scaled_op.attrs["align_corners"]:
                # no padding
                scaled_op.attrs["padding"] = Padding.VALID
            else:
                # padding to the right and bottom (limits average pool to 8x8 kernel)
                scaled_op.attrs["padding"] = Padding.EXPLICIT
                scaled_op.attrs["explicit_padding"] = [0, 0, upscale_factor - 1, upscale_factor - 1]

            # kernel size dependent on the upscaling factor
            scaled_op.attrs.update({"ksize": (1, upscale_factor, upscale_factor, 1)})
        else:  # Op.ResizeNearestNeighbor
            if scaled_op.attrs["align_corners"]:
                # use depthwise conv to select the correct value
                scaled_op = convert_resizenn_ac_to_depthwise_conv(scaled_op, upscale_factor)
            else:
                # Keep 1x1 kernel and average pool, this applies both when
                # half-pixel-centers is True and False. Calculations are the
                # same in the reference.
                pass

        scaled_op.outputs = outputs
        scaled_op.outputs[0].ops = [scaled_op]
        scaled_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, scaled_op)

    return op


def convert_argmax_to_depthwise_conv_and_max_pool(op, arch, nng):
    """
    Convert ArgMax to DWConv2D->MaxPool->DWConv2D, see details below.

    Example:
    arr = [4,   [00000100,
           6,  = 00000110,  # <-- This is the largest value, so we're expecting argmax(arr) = 1
           5]   00000101]

    Use 16-bit precision and shift all values 7 bits to the left:
    Shifted_arr = [0000001000000000,
                   0000001100000000,
                   0000001010000000]

    Add "(c - 1) - index of channel" to each channel:
    Shifted_arr_plus_reverse_idx = [0000001000000010, (+2)
                                    0000001100000001, (+1)
                                    0000001010000000] (+0)

    The index is reversed since ArgMax selects the lowest index if the maximum value is found at two indices. The
    reverse index acts as a tie-breaker between channels with equal values: because we want the smallest channel
    index to be chosen, we reverse the index before the maxpool and subtract it from the number of channels minus
    one after the maxpool to recover the correct index.

    Find the maximum value in the array:
    val = max(shifted_arr_plus_reverse_idx) = 0000001100000001

    Subtract the reverse index (the 7 lowest bits of val) from the number of channels minus one:
    idx = (c - 1) - reverse_idx = 2 - 1 = 1

    Both steps are done by a LUT activation that extracts the 7 lowest bits (cutting off the 9 most significant
    bits) and subtracts them from the base value c - 1:
    idx = LUT(val) = 0000000000000001 = 1
    """

    if op.type == Op.ArgMax:
        ifm, ofm = op.inputs[0], op.outputs[0]
        identity_quant = QuantizationParameters()
        identity_quant.zero_point = 0
        identity_quant.scale_f32 = 1.0
        # Add last dimension to ofm shape
        ofm.shape += [1]
        ofm.ops = []

        # Create 1x1 Depthwise convolution with 2**7 weights for each channel to convert precision to 16 bit and shift
        # all values 7 bits to the left
        # Set necessary depthwise attributes
        dw_op_attrs = {
            "padding": Padding.VALID,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
            "explicit_padding": None,
        }
        orig_name = op.name
        op.name = f"{orig_name}_depthwise_conv_SHL_7"
        op.type = Op.DepthwiseConv2DBias
        op.attrs.update(dw_op_attrs)
        n, h, w, c = full_shape(4, ifm.shape, 1)
        shape = [1, 1, 1, c]
        kernel = np.dstack([2**7] * c)
        op.inputs = []
        op.add_input_tensor(ifm)
        op.add_input_tensor(
            create_const_tensor(
                "weights",
                shape,
                DataType.uint8,
                np.array(kernel).reshape(shape),
                quantization=identity_quant,
            ),
        )
        # Let the bias for each channel be the "reverse" index of the channel it is in, i.e. (c - 1) - channel_idx
        reverse_idxs = list(reversed(range(c)))
        bias_tensor = create_const_tensor(op.name + "_bias", [c], DataType.int64, reverse_idxs)
        op.add_input_tensor(bias_tensor)

        intermediate_tens = Tensor([n, h, w, c], DataType.int16, "int16_and_shifted_7_bits_left")
        intermediate_tens.quantization = ifm.quantization
        op.set_output_tensor(intermediate_tens)
        op.set_ifm_ofm_shapes()
        orig_ifm_shape = op.ifm_shapes[0]
        DebugDatabase.add_optimised(op, op)

        # To extract 7 least significant bits and swap reverse index back to real index using a LUT activation, we set
        # the base value to c-1 and slope to -128. The 16-bit LUT uses a table of 32-bit values where the top 16 bits
        # represent the slope and bottom 16 bits the base which are used to interpolate the activation value.
        slope = (-128 & 0xFFFF) << 16  # Top 16 bits of 32 bit LUT table value
        base = c - 1  # Bottom 16 bits of the LUT table value
        lut_tensor = create_const_tensor(
            "maxpool_LUT_extract_7_LSB",
            [1, 1, 1, 512],
            DataType.uint32,
            [slope + base] * 512,
            TensorPurpose.LUT,
        )

        # Split large feature maps into smaller chunks since the Depthwise Maxpool height dimension can overflow due to
        # flattening the ifm to (H*W)xCx1
        max_height = 2**16 // orig_ifm_shape.width
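        # e.g. a 1024x1024 IFM gives max_height 64, so the maxpool below is split into 16 chunks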
        num_full_height_ops = orig_ifm_shape.height // max_height
        last_op_height = orig_ifm_shape.height - max_height * num_full_height_ops
        op_heights = [max_height] * num_full_height_ops
        if last_op_height > 0:
            op_heights.append(last_op_height)

        # Create maxpool output tensor which is reshaped to 1x(H*W)x1x1. The product H*W might be larger than the
        # maximum allowed height, but that's handled by reading and writing the data in chunks
        maxpool_ofm = Tensor([1, orig_ifm_shape.height * orig_ifm_shape.width, 1, 1], DataType.int16, "argmax_maxpool")
        maxpool_ofm.quantization = identity_quant

        for op_idx, op_height in enumerate(op_heights):
            maxpool_op = create_depthwise_maxpool(
                f"dw_maxpool_{op_idx}", intermediate_tens, orig_ifm_shape, identity_quant
            )
            maxpool_op.outputs = [maxpool_ofm]
            maxpool_ofm.ops.append(maxpool_op)
            maxpool_op.ofm_shapes = [Shape4D(maxpool_ofm.shape)]
            maxpool_op.set_activation_lut(lut_tensor)

            # Set read and write shapes/offsets to read/write chunks of the IFM/OFM
            maxpool_op.read_shapes[0] = Shape4D([1, op_height * orig_ifm_shape.width, orig_ifm_shape.depth, 1])
            maxpool_op.read_offsets[0] = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            maxpool_op.write_shape = Shape4D([1, op_height * orig_ifm_shape.width, 1, 1])
            maxpool_op.write_offset = Shape4D([0, sum(op_heights[:op_idx]) * orig_ifm_shape.width, 0, 0])
            DebugDatabase.add_optimised(op, maxpool_op)

        # Set final shape
        maxpool_ofm.set_all_shapes([1, h, w, 1])

        # Convert 16bit to 32bit or 64bit
        if ofm.dtype == DataType.int64:
            # If OFM dtype is int64 the result is converted by two cast ops (16bit to 32bit)
            #
            # A -> B -> C -> D (OFM)
            # |0001| |00010000| |0001|0000| |00010000|00000000|
            #  i16     i32       i16  i16     i32      i32
            #                                 <-------i64------->
            #
            # Memcpy is used to copy the content from B to C and from D to OFM
            # Memcpy will be turned into a nop or a DMA transfer if the memory regions differ.
            intermediate_32bit = Tensor([1, h, w, 1], DataType.int32, f"{orig_name}_32bit")
        else:
            intermediate_32bit = ofm

        op_cast = create_cast_op(f"{orig_name}_cast_to_32bit_1", maxpool_ofm, intermediate_32bit)
        DebugDatabase.add_optimised(op, op_cast)

        if ofm.dtype == DataType.int64:
            # Create int16 tensor with double shape to cover the intermediate_32bit result from the first cast
            intermediate_16bit_2x_size = Tensor([1, h, w, 2], DataType.int16, f"{orig_name}_16bit_2x_size")
            memcpy_op = create_memcpy(f"{orig_name}_memcpy_1", intermediate_32bit, intermediate_16bit_2x_size)
            DebugDatabase.add_optimised(op, memcpy_op)

            # Create int32 tensor with double ofm shape to be able to store an "int64" result
            intermediate_32bit_2x_size = Tensor([1, h, w, 2], DataType.int32, f"{orig_name}_32bit_2x_size")

            op_cast = create_cast_op(
                f"{orig_name}_cast_to_32bit_2", intermediate_16bit_2x_size, intermediate_32bit_2x_size
            )
            DebugDatabase.add_optimised(op, op_cast)

            memcpy_op = create_memcpy(f"{orig_name}_memcpy_2", intermediate_32bit_2x_size, ofm)
            DebugDatabase.add_optimised(op, memcpy_op)

    return op


def convert_resizebilinear_to_depthwise_convolutions(op, half_pixel_centers=True):
    def _compute_interpolation_values(index, input_size, output_size):
        scale = input_size / output_size
        scaled_value = (index + 0.5 * half_pixel_centers) * scale - 0.5 * half_pixel_centers
        lower_bound = max(np.floor(scaled_value), 0)

        return scaled_value, lower_bound

    def _compute_kernels(input_height, input_width, output_height, output_width):
        kernels = []
        for y in (1, 2):
            for x in (1, 2):
                sv_h, lb_h = _compute_interpolation_values(y, input_height, output_height)
                sv_w, lb_w = _compute_interpolation_values(x, input_width, output_width)

                # Interpolation values calculated for (x, y) = ([1, 2], [1, 2]) will always generalize to the whole
                # input for upscale = 2 and input sizes >= 2x2 and be in the correct order for going left-to-right,
                # top-to-bottom - same as the depthwise convolution strides across each tile
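                # For an x2 upscale with half-pixel centers this evaluates to the classic
                # bilinear kernel 1/16 * [[1, 3], [3, 9]], mirrored appropriately for each
                # of the four tiles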
                kernel = np.zeros((2, 2))
                kernel[1, 1] = (1 - (sv_h - lb_h)) * (1 - (sv_w - lb_w))
                kernel[0, 1] = (sv_h - lb_h) * (1 - (sv_w - lb_w))
                kernel[1, 0] = (1 - (sv_h - lb_h)) * (sv_w - lb_w)
                kernel[0, 0] = (sv_h - lb_h) * (sv_w - lb_w)
                kernel *= 16
                kernels.append(kernel)

        return kernels

    def _build_convolutions(op, kernels):
        dw_op_attrs = {
            "padding": Padding.TILE,
            "stride_h": 1,
            "stride_w": 1,
            "strides": (1, 1, 1, 1),
            "depth_multiplier": 1,
            "channel_multiplier": 1,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "dilation": (1, 1, 1, 1),
        }
        ifm = op.ifm
        ofm = op.ofm
        ofm.ops = []
        elem_size = 2 if ofm.dtype == DataType.int16 else 1

        n, h, w, c = ifm.shape
        _, _, ow, _ = ofm.shape

        intermediate_tens = Tensor(ifm.shape, ifm.dtype, "intermediate_tens")
        intermediate_tens.quantization = op.outputs[0].quantization.clone()
        avgpool_op = op
        avgpool_op.name = "rb_init_avgpool"
        avgpool_op.type = Op.AvgPool
        avgpool_op.attrs["padding"] = Padding.VALID
        avgpool_op.attrs["stride_w"] = 1
        avgpool_op.attrs["stride_h"] = 1
        avgpool_op.attrs["filter_width"] = 1
        avgpool_op.attrs["filter_height"] = 1
        avgpool_op.attrs["strides"] = [1, 1, 1, 1]
        avgpool_op.attrs["ksize"] = [1, 1, 1, 1]

        avgpool_op.add_input_tensor(ifm)
        avgpool_op.set_output_tensor(intermediate_tens)
        avgpool_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, op)

        dw_conv = Operation(Op.DepthwiseConv2DBias, "depthwise_conv")
        dw_conv._original_type = Op.ResizeBilinear
        dw_conv.write_shape = Shape4D(n, h, w, c)
        dw_conv.write_offset = Shape4D(0, 0, 0, 0)

        # Resize bilinear requires rounding away from zero
        dw_conv.rounding_mode = RoundingMode.AwayZero

        # Double height and width stride to write the output of each of the four depthwise convolutions below
        # interleaved with each other when combined with OFM tile base offsets.
        dw_conv.ofm_stride_multiplier = [1, 2, 2]  # C/H/W

        # Choose tile padding direction - pad by 1 with edge values in two directions.
        # For example, TL (top left) will pad top and left in H/W-plane in all channels.
        directions = [[1, 1, 0, 0], [1, 0, 0, 1], [0, 1, 1, 0], [0, 0, 1, 1]]  # TL, TR, BL, BR
        for i in (0, 1):
            for j in (0, 1):
                index = i * 2 + j
                dw_conv.name = f"depthwise_conv_{index}"
                dw_op_attrs["explicit_padding"] = directions[index]
                dw_conv.attrs.update(dw_op_attrs)

                # This will offset the start of the write by modifying the Tile 0 base address
                dw_conv.tile_base_offsets_ofm[0] = (i * ow + j) * c * elem_size

                ofm.ops.append(dw_conv)
                dw_conv.outputs = [ofm]

                kernel = kernels[index]
                shape = [2, 2, 1, c]
                kernel = np.dstack([kernel] * c)

                quant = QuantizationParameters()
                quant.zero_point = 0
                quant.scale_f32 = 1.0 / 16

                dw_conv.inputs = []
                dw_conv.add_input_tensor(intermediate_tens)
                dw_conv.add_input_tensor(
                    create_const_tensor(
                        "weights",
                        shape,
                        intermediate_tens.dtype,
                        np.array(kernel).reshape(shape),
                        quantization=quant,
                    ),
                )

                # setup bias tensor by assigning None and then calling the fix-up function to create a suitable tensor.
                # need to append the bias tensor as resize ops only have 2 inputs
                assert len(dw_conv.inputs) == 2
                dw_conv.inputs.append(None)
                fixup_bias_tensors(dw_conv, None, None, dtype=DataType.int32)

                dw_conv.set_ifm_ofm_shapes()
                DebugDatabase.add_optimised(op, dw_conv)

                dw_conv = dw_conv.clone(f"_{index}")
        return op

    _, input_height, input_width, _ = op.ifm.shape
    _, output_height, output_width, _ = op.ofm.shape

    kernels = _compute_kernels(input_height, input_width, output_height, output_width)
    op = _build_convolutions(op, kernels)

    return op


def fixup_resize(op, arch, nng):
    if op.type.is_resize_op() and op.run_on_npu:
        if op.ifm_shapes[0] == op.ofm_shapes[0]:
            # Bypass the resize op which is essentially a NOP
            op.inputs = op.inputs[:1]
            op.type = Op.Identity
        elif op.ifm_shapes[0].height == 1 and op.ifm_shapes[0].width == 1:
            convert_resize_1x1_to_add(op)
        elif op.type == Op.ResizeBilinear and op.attrs.get("half_pixel_centers", False):
            convert_resizebilinear_to_depthwise_convolutions(op)
        else:
            convert_resize_to_upscale_and_average_pool(op)

    return op


def convert_nop_split_to_identity(op, arch, nng):
    if op.type == Op.Split and op.attrs.get("num_splits") == 1:
        # the list comprehension should return a list with a single tensor
        # if it doesn't, remove_passthrough_tensor will fail appropriately
        op.inputs = [i for i in op.inputs if i.shape == op.outputs[0].shape]
        op.type = Op.Identity
    return op


def rewrite_fully_connected_input(op: Operation, arch, nng):
    # If the operation already has a read shape, do not modify
    # the ifm shape, since that will already be correct
    if op.type == Op.FullyConnected and not op.read_shapes[0]:
        new_shape = op.ifm.get_shape_as_2d(op.weights.shape[-2])
        assert new_shape is not None, "Tensor can not be reshaped to 2D"
        op.ifm_shapes[0] = new_shape

        if op.ifm_shapes[0].batch > 1 and op.ofm_shapes[0].batch == 1:
            # If IFM is batching then also make sure OFM is batching
            h, w = op.ofm_shapes[0].height, op.ofm_shapes[0].width
            op.ofm_shapes[0] = Shape4D([h * w, 1, 1, op.ofm_shapes[0].depth])

    return op


def convert_batched_fc_shape(op, arch, nng):
    if op.type == Op.FullyConnected:
        # Check if the first dimension indicates batching
        if op.ifm_shapes[0].batch > 1:
            batching_split = {4: (2, 2), 8: (2, 4), 16: (4, 4)}
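            # Fold the batch into a small H x W plane (e.g. a batch of 8 becomes 2x4, 16
            # becomes 4x4); batch sizes without an entry fall back to a 1xN plane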
            n = op.ifm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ifm_shapes[0] = Shape4D([1, h, w, op.ifm_shapes[0].depth])

            # Reshape Weights to be 4D. IO becomes HWIO
            weight_tensor = op.inputs[1]
            weight_tensor.values = np.expand_dims(np.expand_dims(weight_tensor.values, axis=0), axis=0)
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            n = op.ofm_shapes[0].batch
            h, w = batching_split.get(n, (1, n))
            op.ofm_shapes[0] = Shape4D([1, h, w, op.ofm_shapes[0].depth])
    return op


def unfuse_activation_function(op):
    if op.type == Op.ConcatTFLite and op.run_on_npu and op.activation is not None:
        act_op = Operation(op.activation.op_type, op.name + op.activation.op_type.name)
        op.activation = None
        out_tens = op.outputs[0]
        intermediate_tens = out_tens.clone("_act_intermediate")
        act_op.set_output_tensor(out_tens)
        act_op.add_input_tensor(intermediate_tens)
        op.set_output_tensor(intermediate_tens)
        act_op.set_ifm_ofm_shapes()
        DebugDatabase.add_optimised(op, act_op)


def rewrite_stridedslice_output(op, arch, nng):
    if not op.run_on_npu or op.type != Op.StridedSlice:
        return op

    new_axis_mask = op.attrs["new_axis_mask"]
    shrink_axis_mask = op.attrs["shrink_axis_mask"]

    if shrink_axis_mask == 0 and new_axis_mask == 0:
        return op

    axis_4D = [0] * len(op.outputs)
    for idx, out_tens in enumerate(op.outputs):
        output_shape = list(out_tens.shape)

        if shrink_axis_mask != 0:
            n = 0
            axis = 0
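            # Extract one set bit per iteration: m &= m - 1 clears the lowest set bit and
            # the difference against prev_mask recovers its position, e.g. mask 0b0101
            # yields axis 0 and then axis 2 (the same technique is used for new_axis_mask)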
            while shrink_axis_mask:
                prev_mask = shrink_axis_mask
                n += 1
                shrink_axis_mask &= shrink_axis_mask - 1
                axis = int(math.log2(prev_mask - shrink_axis_mask))
                output_shape = output_shape[:axis] + [1] + output_shape[axis:]

            assert len(out_tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

        elif new_axis_mask != 0:
            n = 0
            axis = 0
            while new_axis_mask:
                prev_mask = new_axis_mask
                n += 1
                new_axis_mask &= new_axis_mask - 1
                axis = int(math.log2(prev_mask - new_axis_mask))
                output_shape = output_shape[:axis] + output_shape[(axis + 1) :]
                new_axis_mask >>= 1

            assert len(out_tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
            if axis >= 0:
                axis_4D[idx] = axis + (4 - len(output_shape))
            else:
                axis_4D[idx] = axis
            op.ofm_shapes[idx] = Shape4D(output_shape)

    op.attrs["split_axis_4D"] = axis_4D
    return op


def rewrite_unpack_output(op, arch, nng):
    tens = op.outputs[0]
    if op.run_on_npu and op.type == Op.Unpack:
        # Unpack is also referred to as Unstack
        axis = int(op.attrs["axis"])
        if axis < 0:  # Convert to positive axis
            axis = len(op.inputs[0].shape) + 1 + axis
        op.type = Op.UnpackReshaped
        desired_output_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

        axis_4D = axis + (4 - len(desired_output_shape))
        op.attrs["split_axis_4D"] = [axis_4D] * len(op.outputs)

        for idx, out_tens in enumerate(op.outputs):
            op.ofm_shapes[idx] = Shape4D(desired_output_shape)
    return op


def add_padding_fields(op, arch, nng):
    if op.run_on_npu:
        if "padding" in op.attrs:
            input_shape = op.ifm_shapes[0]
            output_shape = op.ofm_shapes[0]
            if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
                kernel_size = op.inputs[1].shape[:2]
            elif op.type.is_pool_op() or op.type.npu_block_type == NpuBlockType.ReduceSum:
                kernel_size = op.attrs["ksize"][1:3]
            else:
                raise UnsupportedFeatureError(f"Unknown operation that uses padding: {optype_to_builtintype(op.type)}")

            if op.type == Op.Conv2DBackpropInputSwitchedBias:
                upscaling_factor = output_shape.height // input_shape.height
                padding, skirt = calc_upscaled_padding_and_skirt(
                    op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
                )
            else:
                padding, skirt = calc_padding_and_skirt(
                    op.attrs["padding"],
                    op.kernel,
                    input_shape,
                    op.attrs.get("explicit_padding"),
                )

            op.attrs["explicit_padding"] = padding
            op.attrs["skirt"] = skirt

    return op


def reorder_depthwise_weights(op, arch, nng):
    if op.type.is_depthwise_conv2d_op():
        weight_tensor = op.inputs[1]
        if not weight_tensor.weight_transpose_depthwise:
            weight_tensor.values = np.transpose(weight_tensor.values, (0, 1, 3, 2))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))
            weight_tensor.weight_transpose_depthwise = True

    return op


def convert_avg_pool_to_conv2d(op: Operation, arch, nng) -> Operation:
    """Convert strided Average Pools with stride >= 4 to Conv2D."""
    if op.type != Op.AvgPool:
        return op

    stride_x, stride_y = op.get_kernel_stride()
    # For strides <= 3 no optimization is needed
    if stride_x <= 3:
        return op
    h, w = op.attrs["filter_height"], op.attrs["filter_width"]
    inputs = op.inputs[0]
    shape = inputs.shape

    # Set necessary conv2d attributes
    op.attrs.update(
        {
            "stride_h": stride_y,
            "stride_w": stride_x,
            "dilation_h_factor": 1,
            "dilation_w_factor": 1,
            "strides": (1, stride_y, stride_x, 1),
            "dilation": (1, 1, 1, 1),
        }
    )

    # Change op type
    op.type = Op.Conv2DBias
    op.name += "_conv2d"

    op.rounding_mode = RoundingMode.AwayZero
    shape = [h, w, 1, op.ofm.shape[-1]]
    weights = np.full(shape, 1)
    quant = QuantizationParameters(scale_f32=1 / (h * w), zero_point=0)
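    # Unit weights combined with a weight scale of 1 / (h * w) make the convolution sum
    # each h x w window and rescale it to the mean, matching the average pool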
    # Add unit weight tensor
    op.add_input_tensor(
        create_const_tensor(
            "weights",
            shape,
            inputs.dtype,
            weights,
            quantization=quant,
        ),
    )
    op.weights.values = np.reshape(op.inputs[1].values, shape)

    # Set IFM/OFM shapes after changing op type
    op.set_ifm_ofm_shapes()
    return op


def fixup_strided_conv(op: Operation, arch, nng):
    """Optimize or fixup strided Conv2DBias
    Optimization:
        Reduce, when possible, the Conv2DBias stride from N with 1 < N < 4 to 1
        by re-shaping both IFM and filter.

    Fixup:
        Introduce software support for Conv2DBias with stride_width >= 4 by
        reducing it to 1, 2 or 3 (HW supported strides) when possible by
        re-shaping both IFM and filter.
    """
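    # The underlying trick: viewing the IFM as (width // resize_factor, depth * resize_factor)
    # folds horizontally adjacent elements into the channel dimension, so a horizontal stride
    # of N becomes final_stride on the reshaped tensor; the (zero-padded) filter is reshaped
    # the same way below.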
    if op.type != Op.Conv2DBias:
        return op
    stride_x, stride_y = op.get_kernel_stride()
    weight_tensor = op.weights
    ifm_shape = op.ifm_shapes[0]

    # Do not optimize if op is not the first in the network and stride is
    # supported by the hardware
    if op.op_index != 0 and stride_x < 4:
        return op

    resize_factor, final_stride = calc_resize_factor(ifm_shape.width, stride_x)

    def calc_filter_padding(
        ifm_padding_type: Padding | None,
        ifm_current_padding_x: int,
        post_op_stride: int,
        opt_resize_factor: int,
        filter_width: int,
        ifm_width: int,
    ) -> tuple[int, int, int, int]:
        """Calculate zero padding to be added to the filter.

        Parameters
        ----------
        ifm_padding_type : Padding or None
            The padding type that is applied to the IFM.
        ifm_current_padding_x : int
            Padding amount that is added to the IFM before optimization.
        post_op_stride : int
            The final stride once optimization is performed.
        opt_resize_factor : int
            The factor by which the stride will be reduced.
            E.g. opt_resize_factor = 2 on a stride of 4 will produce
            a stride of 2 after the optimization
        filter_width : int
            Width of the filter before optimization.
        ifm_width : int
            Width of the IFM before optimization

        Returns
        -------
        padding : tuple[int, int, int, int]
            A tuple with the amount of padding on each side (top, left, bottom, right)
        """
        padding_size = 0
        padding = (0, 0, 0, 0)
        if ifm_padding_type and ifm_padding_type != Padding.VALID:
            # Compute padding size for the filter that guarantees that HW padding added to IFM matches
            # before and after the optimization is performed
            expected_filter_size = 0
            pre_opt_stride = post_op_stride * opt_resize_factor
            post_opt_ifm_width = ifm_width // opt_resize_factor
            # Compute the total expected filter size post optimization that ensures that the same HW padding
            # is added to IFM.
            # There are two ways of calculating required filter size depending on whether IFM width is divisible
            # by stride width or not. These approaches match the cases used to calculate HW padding in
            # needed_total_padding method.
            if ifm_width % pre_opt_stride == 0:
                expected_filter_size = ifm_current_padding_x + post_op_stride
            else:
                expected_filter_size = ifm_current_padding_x + (post_opt_ifm_width % post_op_stride)
            # Compute padding size from expected filter size
            padding_size = expected_filter_size * opt_resize_factor - filter_width

            if ifm_current_padding_x == 0:
                # If no HW padding is added to IFM, divide filter padding between left and right following
                # the same strategy as the reference.
                padding_left = padding_size // 2
            else:
                # If HW padding is added to IFM, split padding for the filter so that left padding and right padding
                # are proportional to left and right HW padding.
                left_hw_padding = ifm_current_padding_x // 2
                # Compute filter padding
                padding_left = padding_size // ifm_current_padding_x * left_hw_padding
            padding = (0, padding_left, 0, padding_size - padding_left)

        # Check if filter width is divisible by the stride width (required for optimization)
        # If filter width is not divisible by stride width and no HW padding is added to IFM, compute
        # filter padding required for the filter width to be divisible by the stride width and apply it as right
        # padding.
        if filter_width % opt_resize_factor != 0 and (padding_size == 0 or ifm_current_padding_x == 0):
            padding_size = opt_resize_factor - (filter_width % opt_resize_factor)
            # Add padding zeros to the right
            padding = (0, 0, 0, padding_size)

        return padding

    # Compute the depth of the IFM once the strided Conv2D is optimised
    post_opt_ifm_depth = ifm_shape.depth * resize_factor

    if stride_x > 1 and (post_opt_ifm_depth <= 8 or stride_x > 3) and resize_factor != 1 and weight_tensor is not None:
        k_w, _ = op.get_kernel_size()
        weight_shape = weight_tensor.shape

        padding_type = op.attrs.get("padding", None)
        if padding_type in (None, Padding.EXPLICIT, Padding.TILE):
            return op
        # Compute current padding as if IFM padding is SAME
        curr_padding_x = needed_total_padding(ifm_shape.width, stride_x, k_w)
        # Compute the padding needed on the filter for the optimisation
        _, left_filter_padding, _, right_filter_padding = calc_filter_padding(
            padding_type, curr_padding_x, final_stride, resize_factor, k_w, ifm_shape.width
        )
        total_horizontal_padding = left_filter_padding + right_filter_padding
        # If IFM padding is enabled, check if pre-opt and post-opt padding is
        # the same while taking into consideration the extra filter padding.
        if padding_type == Padding.SAME:
            optimised_padding_x = needed_total_padding(
                ifm_shape.width // resize_factor, final_stride, (k_w + 1 + total_horizontal_padding) // resize_factor
            )
            if curr_padding_x != optimised_padding_x:
                # Horizontal padding would become different after optimisation; this would not work
                return op

        # Resize IFM
        op.ifm_shapes[0] = Shape4D(
            [ifm_shape.batch, ifm_shape.height, ifm_shape.width // resize_factor, ifm_shape.depth * resize_factor]
        )

        # Compute list of 0 padding for each dimension of the filter
        filter_dimension_padding = [(0, 0) for _ in weight_tensor.shape]
        # Update padding for filter width with computed padding
        filter_dimension_padding[1] = (left_filter_padding, right_filter_padding)
        # Add padding to the filter
        zero_point = weight_tensor.quantization.zero_point
        padding_constant = zero_point if np.isscalar(zero_point) else 0
        padded_filter_tensor = np.pad(weight_tensor.values, filter_dimension_padding, constant_values=padding_constant)
        weight_shape[1] = padded_filter_tensor.shape[1]
        weight_tensor.values = padded_filter_tensor
        # Change weight shape based on stride_x
        weight_shape[1] //= resize_factor
        weight_shape[2] *= resize_factor

        weight_tensor.values = np.reshape(weight_tensor.values, weight_shape)
        weight_tensor.set_all_shapes(weight_shape)
        # If multiple copies of the weights are used, we could avoid
        # them having the same address by changing the value_id
        weight_tensor.value_id = uuid.uuid4()

        # Strides
        stride_x = final_stride
        op.attrs.update({"stride_w": stride_x, "stride_h": stride_y, "strides": (1, stride_y, stride_x, 1)})

    return op


def convert_conv_to_fc(op, arch, nng):
    # Conv 1x1 can be equivalent to Fully Connected.
    # By representing certain convs as fully connected layers, Vela can better determine whether or not to use
    # caching/double buffering for the weights.
    # (Weights don't need to be reloaded for convs when IFM H and W are 1)
    if op.type == Op.Conv2DBias:
        h = op.ifm_shapes[0].height
        w = op.ifm_shapes[0].width
        kh, kw, _, _ = op.inputs[1].shape
        if h == 1 and w == 1 and kh == 1 and kw == 1:
            # Overwrite this op as a Fully Connected Op
            op.name += "_fc"
            op.type = Op.FullyConnected
            op.attrs = {
                "weights_format": 0,
            }
            # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
            weight_tensor = op.inputs[1]
            weight_tensor.values = weight_tensor.values.squeeze(axis=(0, 1))
            weight_tensor.set_all_shapes(list(weight_tensor.values.shape))

            DebugDatabase.add_optimised(op, op)
    return op


def fixup_relus_with_differing_ifm_ofm_scaling(op, arch, nng):
    if op.run_on_npu and op.type.is_relu_op():
        ifm = op.inputs[0]
        ofm = op.outputs[0]
        # Relu with differing IFM and OFM scaling cannot be fused with another primary op
        # and requires its own primary op to be inserted
        if not check_quantized_tens_scaling_equal(ifm, ofm):
            # Override this op with its own primary op (avgpool)
            relu_fused_op = create_avgpool_nop(op.name + "_avgpool")
            # And fuse the original activation function to it
            relu_fused_op.activation = create_activation_function(op.type)
            # Add explicit rescaling
            rescale = ifm.quantization.scale_f32 / ofm.quantization.scale_f32
            multiplier, shift = scaling.quantise_scale(rescale)
            relu_fused_op.explicit_scaling = ExplicitScaling(False, [shift], [multiplier])
            # Tidy up and assign the ifm and ofm to the new op
            ifm.consumer_list.remove(op)

            relu_fused_op.add_input_tensor(ifm)
            relu_fused_op.set_output_tensor(ofm)
            relu_fused_op.set_ifm_ofm_shapes()
            op = relu_fused_op
    return op


def convert_lstm(op, arch, nng):
    if op.type == Op.UnidirectionalSequenceLstm:
        lstm = Lstm(op)
        op = lstm.get_graph()
    return op


def convert_softmax(op, arch, nng):
    if op.type == Op.Softmax and op.run_on_npu:
        softmax = SoftMax(op)
        op = softmax.get_graph()
    return op


def convert_prelu(op, arch, nng):
    if op.type == Op.Prelu:
        ifm, alpha, ofm = op.get_ifm_ifm2_ofm()
        if None in (ifm, alpha, ofm):
            return op

        if alpha.values is not None:
            # If alpha is const, check for possible optimisations
            alpha_zp = alpha.quantization.zero_point
            alpha_scale = alpha.quantization.scale_f32
            # If all alpha values are the same the PReLU can be converted to LeakyRelu
            alpha_min = (alpha.values.min().astype(int) - alpha_zp) * alpha_scale
            alpha_max = (alpha.values.max().astype(int) - alpha_zp) * alpha_scale
            if alpha_min == alpha_max:
                # or even a Relu
                if alpha_min == 0:
                    new_op = Op.Relu
                else:
                    new_op = Op.LeakyRelu
                    op.attrs["alpha"] = alpha_min
                    # setup alpha_scaling for bit exact result
                    ifm_scale = ifm.quantization.scale_f32
                    ofm_scale = ofm.quantization.scale_f32
                    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha_scale, ofm_scale)
                    op.attrs["alpha_scaling"] = (alpha.values.min() - alpha_zp, alpha_scale, alpha_shift)
                # Change op type
                op.type = new_op
                op.name = op.name.replace("Prelu", new_op.name)
                del op.inputs[1]  # Remove alpha tensor
                return op
            elif alpha_max < 1:
                # If alpha_max is less than 1 convert PReLU to Max(alpha * IFM, identity * IFM)
1258 # Multiply with alpha tensor
1259 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1260 mul_alpha.add_input_tensor(ifm)
1261 mul_alpha.add_input_tensor(alpha)
1262 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1263 mul_alpha.set_output_tensor(fm_alpha)
1264 mul_alpha.set_ifm_ofm_shapes()
1265 DebugDatabase.add_optimised(op, mul_alpha)
1266 if check_quantized_tens_scaling_equal(ifm, ofm):
1267 # No scaling is needed
1268 fm_id = ifm
1269 else:
1270 # Add multiplication with identity
1271 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1272 mul_identity.add_input_tensor(ifm)
1273 # Create const tensor containing identity as scalar
1274 quantization = ifm.quantization.clone()
1275 quantization.scale_f32 = np.float32(1)
1276 quantization.zero_point = 0
1277 one = create_const_tensor("one_const", [], ifm.dtype, [1], quantization=quantization)
1278 mul_identity.add_input_tensor(one)
1279 # Make sure that fm_id is allocated to a different address than fm_alpha
1280 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1281 mul_identity.set_output_tensor(fm_id)
1282 mul_identity.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001283 DebugDatabase.add_optimised(op, mul_identity)
Fredrik Svedberg66591652022-08-29 10:51:27 +02001284
1285 # Combine scaled and alpha multiplied values
1286 max_op = Operation(Op.Maximum, op.name + "_max")
1287 max_op.add_input_tensor(fm_alpha)
1288 max_op.add_input_tensor(fm_id)
1289 max_op.set_output_tensor(ofm)
1290 max_op.set_ifm_ofm_shapes()
1291
1292 DebugDatabase.add_optimised(op, max_op)
1293 ifm.consumer_list.remove(op)
1294 return max_op
1295
1296 # Catch all PReLU conversion for the cases that could not be optimised above
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001297 no_scale_quant = ifm.quantization.clone()
1298 no_scale_quant.scale_f32 = None
1299 no_scale_quant.zero_point = 0
Fredrik Svedberg66591652022-08-29 10:51:27 +02001300 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001301
1302 # Select values < 0
1303 min_op = Operation(Op.Minimum, op.name + "_min")
1304 min_op.add_input_tensor(ifm)
1305 min_op.add_input_tensor(zero)
1306 fm_negative = ifm.clone(op.name + "_negative", set_unique=True)
1307 min_op.set_output_tensor(fm_negative)
1308 min_op.set_ifm_ofm_shapes()
1309 DebugDatabase.add_optimised(op, min_op)
1310
1311 # and multiply with alpha tensor
1312 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
1313 mul_alpha.add_input_tensor(fm_negative)
1314 mul_alpha.add_input_tensor(alpha)
1315 fm_alpha = ofm.clone(op.name + "_negative_alpha", set_unique=True)
1316 mul_alpha.set_output_tensor(fm_alpha)
1317 mul_alpha.set_ifm_ofm_shapes()
1318 DebugDatabase.add_optimised(op, mul_alpha)
1319
1320 # Select (and scale) values > 0
1321 relu_op = Operation(Op.Relu, op.name + "_relu")
1322 relu_op.add_input_tensor(ifm)
1323 fm_scaled = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1324 relu_op.set_output_tensor(fm_scaled)
1325 relu_op.set_ifm_ofm_shapes()
1326 DebugDatabase.add_optimised(op, relu_op)
1327
1328 # Add scaled and alpha multiplied values (without scaling)
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001329 add_op = Operation(Op.Add, op.name + "_add")
1330 add_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02001331 add_op.add_input_tensor(fm_alpha)
1332 add_op.add_input_tensor(fm_scaled)
1333 add_op.set_output_tensor(ofm)
1334 add_op.set_ifm_ofm_shapes()
1335
1336 DebugDatabase.add_optimised(op, add_op)
1337 ifm.consumer_list.remove(op)
1338 op = add_op
1339
1340 return op
1341
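# --- Illustrative sketch (not part of the rewrite above) ---------------------
# Float reference for the catch-all decomposition above: PReLU(x) equals
# Relu(x) + alpha * Min(x, 0), which is exactly the Min/Mul/Relu/Add graph
# built when no cheaper rewrite applies. The _demo_ function is illustrative.
def _demo_prelu_decomposition():
    x = np.linspace(-4, 4, 9)
    alpha = 0.25
    prelu = np.where(x >= 0, x, alpha * x)
    decomposed = np.maximum(x, 0) + alpha * np.minimum(x, 0)
    assert np.allclose(prelu, decomposed)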
1342
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001343def convert_mul_max_to_abs_or_lrelu(op, arch, nng):
1344 r"""Whenever there is a subgraph with this topology:
1345
1346    Input    X          For X = -1 or X > 0
1347    |   \   /           This subgraph can be replaced with either
1348    |    Mul            an Abs (if X = -1) or a LeakyReLU (if X > 0)
1349    |   /
1350    Max
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001351 """
1352
1353 if op.type == Op.Maximum:
1354 # finds the Mul input(s) to the Max
1355 muls = [i for i in op.inputs if i.ops[0].type == Op.Mul]
1356 if len(muls) == 1:
1357 mul = muls[0].ops[0]
1358 elif len(muls) == 2:
1359 # In the case both inputs are Muls, find the one with the same input as the Max
Fredrik Svedberg66591652022-08-29 10:51:27 +02001360 mul_ifms = [m for m in muls if len(set(op.inputs + m.ops[0].inputs)) == 1]
1361 if len(mul_ifms):
1362 mul = mul_ifms[0].ops[0]
1363 else:
1364 # Not using same input
1365 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001366 else:
1367 # No Mul inputs
1368 return op
1369
1370 # make sure the Mul doesn't have any other consumers
1371 mul_ofm = mul.outputs[0]
1372 if len(mul_ofm.consumers()) != 1:
1373 return op
1374 # make sure the Mul doesn't have a fused activation function
1375 if mul.activation:
1376 return op
1377 ifm, ofm = op.get_ifm_ofm()
1378 if ifm is None or ofm is None:
1379 return op
1380
1381 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1382 return op
1383 if not check_quantized_tens_scaling_equal(ifm, ofm) or not check_quantized_tens_scaling_equal(ifm, mul_ofm):
1384 # rewrite to LeakyRelu currently only makes sense if the quantization is identical
1385 return op
1386
1387 # finds the branched input that goes to both the Max and the Mul
1388 shared = set(op.inputs) & set(mul.inputs)
1389 if len(shared) == 1:
1390 shared_in = shared.pop()
1391 # find the constant scalar input to the Mul
1392 const_tens = (set(mul.inputs) - {shared_in}).pop()
1393 # check that it is a scalar
1394 if const_tens.shape != []:
1395 return op
1396 const = const_tens.ops[0]
1397 # check that it is a constant
1398 if const.type != Op.Const:
1399 return op
1400 # Remove the Mul from the shared input's consumers
1401 shared_in.consumer_list.remove(mul)
1402 else:
1403 return op
1404
1405 val = const.outputs[0].values
1406 if val >= 0:
1407 new_op = Op.LeakyRelu
1408 op.attrs["alpha"] = val
1409 # to produce bit exact results, the alpha is not enough;
1410 # save additional scaling info in attr "alpha_scaling", to be used as input
1411 # to the LUT construction
James Peet7519d502021-07-19 16:47:58 +01001412 alpha_scalar = const_tens.values - const_tens.quantization.zero_point
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001413 mul_ifm_scale = np.double(ifm.quantization.scale_f32)
1414 mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
1415 mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
1416 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
1417 op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
1418 elif val == -1:
1419 new_op = Op.Abs
1420 else:
1421 return op
1422
1423 op.type = new_op
1424 op.name = op.name.replace("Maximum", new_op.name)
1425 op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op.name)
1426 op.inputs = [shared_in]
1427 op.set_ifm_ofm_shapes()
1428
1429 # Record optimisation in debug database
1430 DebugDatabase.add_optimised(op, op)
1431
1432 return op
1433
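# --- Illustrative sketch (not part of the rewrite above) ---------------------
# Why the Mul/Max subgraph collapses: for a scalar 0 <= alpha <= 1,
# Max(x, alpha * x) keeps x on the positive side and alpha * x on the negative
# side, i.e. LeakyRelu(x, alpha); for alpha == -1 it is Abs(x). Minimal check:
def _demo_mul_max_identities():
    x = np.linspace(-4, 4, 9)
    alpha = 0.1
    assert np.allclose(np.maximum(x, alpha * x), np.where(x >= 0, x, alpha * x))
    assert np.allclose(np.maximum(x, -1.0 * x), np.abs(x))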
1434
1435def convert_hardswish_to_lut(op, arch, nng):
1436 if op.type == Op.HardSwish:
1437 ifm, ofm = op.get_ifm_ofm()
1438 # Generate the LUT
1439 ifm_scale = np.double(ifm.quantization.scale_f32)
1440 ofm_scale = np.double(ofm.quantization.scale_f32)
1441 zp_in = ifm.quantization.zero_point
1442 zp_out = ofm.quantization.zero_point
1443 ifm_scale_hires = (1 / 128) * ifm_scale
1444 relu_multiplier = np.double(3 / 32768)
1445 out_scale, out_shift = scaling.quantise_scale(ifm_scale_hires / ofm_scale)
1446 relu_scale, relu_shift = scaling.quantise_scale(ifm_scale_hires / relu_multiplier)
1447 # Use 16bit scale
1448 out_scale_16 = fp_math.downscale_multiplier_int32_to_int16(out_scale)
1449 relu_scale_16 = fp_math.downscale_multiplier_int32_to_int16(relu_scale)
1450
1451 values = []
1452 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1453 quantized_min = min(ix)
1454 quantized_max = max(ix)
1455 for x in ix:
1456 input_value = x - zp_in
1457 input_value_hires = input_value * 128
1458 # Compute the input value on essentially the output scale, not shifted yet
1459 input_value_preshift = fp_math.saturating_rounding_mul16(input_value_hires, out_scale_16)
1460 # Compute the "relu-ish multiplier". This matches the code in the TensorFlow Lite Micro kernel
1461 relu_value = np.int16(input_value_hires)
1462 if relu_shift < 31:
1463 relu_value = fp_math.shift_left16(relu_value, 30 - relu_shift)
1464
1465 relu_value = fp_math.saturating_rounding_mul16(relu_value, relu_scale_16)
1466
1467 if relu_shift < 31:
1468 relu_value = fp_math.shift_left16(relu_value, 1)
1469
1470 if relu_shift > 31:
1471 relu_value = fp_math.rounding_divide_by_pot(relu_value, relu_shift - 31)
1472
1473 # The value has now been rescaled to a 16bit fixedpoint relu_value in [-1, 1]
1474 # Now convert that to a 16bit fixedpoint value in [0, 1]
1475 relu_value = (relu_value + (1 << 15)) >> 1
1476 lut_result = fp_math.saturating_mul16(relu_value, input_value_preshift)
1477 shift = 31 - out_shift
1478 shift = -shift if shift < 0 else 0
1479 # Finally apply the output shift
1480 lut_result = fp_math.rounding_divide_by_pot(lut_result, shift) + zp_out
1481 lut_result = min(quantized_max, max(quantized_min, lut_result))
1482 values.append(lut_result)
1483 return convert_to_lut(op, values, "hardswish")
1484 return op
1485
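# --- Illustrative sketch (not part of the rewrite above) ---------------------
# Float reference for the fixed-point LUT loop above: HardSwish(x) is
# x * relu6(x + 3) / 6. The loop computes the same function on the dequantised
# input grid, only with 16-bit saturating arithmetic. A minimal sketch:
def _demo_hardswish_reference(x_quant, ifm_scale, zp_in, ofm_scale, zp_out):
    x_real = ifm_scale * (x_quant - zp_in)
    y_real = x_real * min(max(x_real + 3.0, 0.0), 6.0) / 6.0  # x * relu6(x + 3) / 6
    return round(zp_out + y_real / ofm_scale)  # caller clamps to the dtype range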
1486
1487def convert_lrelu_to_mul_max(op, arch):
1488 # Converts LeakyRelu to Max(alpha * IFM, identity * IFM)
1489 # (the opposite of convert_mul_max_to_abs_or_lrelu)
1490 ifm, ofm = op.get_ifm_ofm()
1491 if ifm is None or ofm is None:
1492 return op
1493
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001494 alpha = np.float32(op.attrs["alpha"])
1495 use_mul_max = 0 < alpha < 1
Fredrik Svedberg36424312022-09-16 09:39:26 +02001496 is_converted_prelu = "alpha_scaling" in op.attrs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001497 if use_mul_max:
1498 mul_ifm = ifm
1499 new_op = Op.Maximum
1500 else:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001501 # Need to use a different approach for alpha < 0 or alpha > 1
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001502 no_scale_quant = ifm.quantization.clone()
1503 no_scale_quant.scale_f32 = None
1504 no_scale_quant.zero_point = 0
1505 zero = create_const_tensor("zero_const", [], ifm.dtype, [0], quantization=no_scale_quant)
1506
1507 # Select values < 0
1508 min_op = Operation(Op.Minimum, op.name + "_min")
1509 min_op.add_input_tensor(ifm)
1510 min_op.add_input_tensor(zero)
1511 mul_ifm = ifm.clone(op.name + "_negative", set_unique=True)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001512 if alpha < 0 and not is_converted_prelu:
1513 # For negative alpha that is not from a converted PReLU we need to use
1514 # int32 Mul below to perform the (negative) alpha scaling
1515 mul_ifm.dtype = DataType.int32
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001516 min_op.set_output_tensor(mul_ifm)
1517 min_op.set_ifm_ofm_shapes()
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001518 new_op = Op.Add
1519 op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1]) # No scaling
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001520 DebugDatabase.add_optimised(op, min_op)
1521
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001522 # Add multiplication with alpha
1523 mul_alpha = Operation(Op.Mul, op.name + "_mul_alpha")
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001524 mul_alpha.add_input_tensor(mul_ifm)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001525 # Create const tensor containing alpha as scalar
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001526 quantization = ifm.quantization.clone()
1527 quantization.min = 0
1528 quantization.max = alpha * (quantization.quant_max - quantization.quant_min)
1529 quantization.zero_point = 0
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001530 alpha_dtype = mul_ifm.dtype
Fredrik Svedberg36424312022-09-16 09:39:26 +02001531 if is_converted_prelu:
1532 # The LeakyRelu was the result from convert_prelu and the scaling is provided
Fredrik Svedberg66591652022-08-29 10:51:27 +02001533 scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
Fredrik Svedberg4a434cb2022-09-27 14:13:01 +02001534 mul_alpha.explicit_scaling = ExplicitScaling(False, [alpha_shift], [alpha_scale])
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001535 elif alpha == 0 or np.isinf(1 / alpha):
1536 # Handling of alpha near or at zero
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001537 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001538 scalar = 0
1539 else:
1540 quantization.scale_f32 = alpha
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001541 if alpha_dtype == DataType.int32:
Fredrik Svedberg36424312022-09-16 09:39:26 +02001542 # When the datatype is int32 (alpha negative) we need to do the scaling with the multiplication
Fredrik Svedberg7f3ccd52022-09-13 15:22:01 +02001543 scalar, _ = scaling.elementwise_mul_scale(ifm.quantization.scale_f32, alpha, ofm.quantization.scale_f32)
1544 else:
1545 scalar = 1
Tim Hall3b1578e2023-01-13 17:57:25 +00001546 alpha_tens = create_const_tensor(op.name + "_alpha_scalar", [1], alpha_dtype, [scalar], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001547 mul_alpha.add_input_tensor(alpha_tens)
1548 fm_alpha = ofm.clone(op.name + "_alpha", set_unique=True)
1549 mul_alpha.set_output_tensor(fm_alpha)
1550 mul_alpha.set_ifm_ofm_shapes()
1551 DebugDatabase.add_optimised(op, mul_alpha)
1552
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001553 if not use_mul_max:
1554 relu_op = Operation(Op.Relu, op.name + "_relu")
1555 relu_op.add_input_tensor(ifm)
1556 fm_id = ofm.clone(op.name + "_positive_scaled", set_unique=True)
1557 relu_op.set_output_tensor(fm_id)
1558 relu_op.set_ifm_ofm_shapes()
1559 DebugDatabase.add_optimised(op, relu_op)
1560 elif check_quantized_tens_scaling_equal(ifm, ofm):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001561 # No identity multiplication is needed
1562 fm_id = ifm
1563 else:
1564 # Add multiplication with identity
1565 mul_identity = Operation(Op.Mul, op.name + "_mul_identity")
1566 mul_identity.add_input_tensor(ifm)
1567 # Create const tensor containing identity as scalar
1568 quantization = ifm.quantization.clone()
1569 quantization.min = 0
1570 quantization.max = quantization.quant_max - quantization.quant_min
Fredrik Svedbergcce872b2021-09-02 15:20:52 +02001571 quantization.scale_f32 = np.float32(1)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001572 quantization.zero_point = 0
Tim Hall3b1578e2023-01-13 17:57:25 +00001573 identity_tens = create_const_tensor(op.name + "_id_scalar", [], ifm.dtype, [1], quantization=quantization)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001574 mul_identity.add_input_tensor(identity_tens)
1575 # Make sure that fm_id is allocated to a different address than fm_alpha
1576 fm_id = ofm.clone(op.name + "_id", set_unique=True)
1577 mul_identity.set_output_tensor(fm_id)
1578 mul_identity.set_ifm_ofm_shapes()
1579 DebugDatabase.add_optimised(op, mul_identity)
1580
1581 # Convert LeakyRelu to Max, add the results of the multiplication(s) as inputs
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001582 op.type = new_op
1583 op.name = op.name.replace("LeakyRelu", new_op.name)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001584 op.inputs = []
1585 ifm.consumer_list.remove(op)
1586 op.add_input_tensor(fm_alpha)
1587 op.add_input_tensor(fm_id)
1588 op.set_ifm_ofm_shapes()
1589
1590 DebugDatabase.add_optimised(op, op)
1591 return op
1592
1593
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001594def convert_to_lut8(op, fn, fn_name):
1595 # Converts op to a no-op + int8/uint8 LUT which is generated with the given function.
1596 # fn is a function(real) -> real
1597 ifm, ofm = op.get_ifm_ofm()
1598 if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
1599 return op
1600 # Generate the LUT
1601 ifm_scale = np.double(ifm.quantization.scale_f32)
1602 ofm_scale = np.double(ofm.quantization.scale_f32)
1603 zp_in = ifm.quantization.zero_point
1604 zp_out = ofm.quantization.zero_point
1605 values = []
1606 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1607 quantized_min = min(ix)
1608 quantized_max = max(ix)
1609 for x in ix:
1610 x_real = ifm_scale * (x - zp_in)
1611 y_real = fn(x_real)
1612 lut_result = round_away_zero(zp_out + y_real / ofm_scale)
1613 lut_result = min(quantized_max, max(quantized_min, lut_result))
1614 values.append(lut_result)
1615 return convert_to_lut(op, values, fn_name)
1616
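# --- Illustrative sketch (not part of the rewrite above) ---------------------
# What convert_to_lut8 produces, in plain numpy: one dequantise -> fn ->
# requantise pass over all 256 input codes. A minimal int8 sketch; note that
# np.round rounds half to even, whereas the pass above rounds away from zero.
def _demo_build_int8_lut(fn, ifm_scale, zp_in, ofm_scale, zp_out):
    x = np.arange(-128, 128)
    y_real = np.vectorize(fn)(ifm_scale * (x - zp_in))
    return np.clip(np.round(zp_out + y_real / ofm_scale), -128, 127).astype(np.int8)


# e.g. _demo_build_int8_lut(math.tanh, 0.02, 0, 1 / 128, 0) tabulates tanh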
1617
1618def convert_lrelu_to_lut(op, arch):
1619 ifm, ofm = op.get_ifm_ofm()
1620 # Generate the LUT
1621 alpha = op.attrs["alpha"]
1622 ifm_scale = np.double(ifm.quantization.scale_f32)
1623 ofm_scale = np.double(ofm.quantization.scale_f32)
1624 zp_in = ifm.quantization.zero_point
1625 zp_out = ofm.quantization.zero_point
1626 identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
1627 alpha_scalar = 1
1628 alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
1629 if "alpha_scaling" in op.attrs:
1630 # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
1631 alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]
1632 values = []
1633 ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
1634 quantized_min = min(ix)
1635 quantized_max = max(ix)
1636 for x in ix:
1637 if x < zp_in:
1638 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(
1639 alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
1640 )
1641 else:
1642 lut_result = zp_out + fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
1643 lut_result = min(quantized_max, max(quantized_min, lut_result))
1644 values.append(lut_result)
1645 return convert_to_lut(op, values, "lrelu")
1646
1647
1648def convert_lrelu(op, arch, nng):
1649 # Converts LeakyRelu to a LUT based solution if possible, otherwise a mul + max
1650 if op.type != Op.LeakyRelu:
1651 return op
1652 ifm, ofm = op.get_ifm_ofm()
1653 if ifm is None or ofm is None:
1654 return op
Fredrik Svedberg36424312022-09-16 09:39:26 +02001655 alpha = op.attrs["alpha"]
1656 if alpha == 0:
1657 # When alpha is 0 the operation can be converted to a ReLU
1658 op.type = Op.Relu
1659 op.name = op.name.replace("LeakyRelu", op.type.name)
1660 return op
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001661 if ifm.dtype in (DataType.uint8, DataType.int8) and ifm.dtype == ofm.dtype:
1662 # use LUT for int8/uint8
1663 return convert_lrelu_to_lut(op, arch)
Fredrik Svedberg36424312022-09-16 09:39:26 +02001664 if check_quantized_tens_scaling_equal(ifm, ofm) and ifm.dtype == ofm.dtype == DataType.int16 and alpha > 0:
Fredrik Svedberg701ba912022-09-07 16:01:15 +02001665 # use LeakyRelu unmodified for int16 with equal input/output scaling and positive alpha
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001666 return op
1667 return convert_lrelu_to_mul_max(op, arch)
1668
1669
1670def convert_tanh_sigmoid_to_lut(op, arch, nng):
1671 # Converts int8/uint8 Sigmoid and Tanh to a LUT based solution
1672 if op.type == Op.Sigmoid:
1673 return convert_to_lut8(op, clamp_sigmoid, "sigmoid")
1674 elif op.type == Op.Tanh:
1675 return convert_to_lut8(op, math.tanh, "tanh")
1676 return op
1677
1678
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001679def fuse_activation_function_with_prev(op, arch, nng):
1680 # if op is a no-op: attempts to move the activation function to the preceding op
1681 if not op.attrs.get("is_nop", False) or op.activation is None:
1682 return op
1683 ifm, ofm = op.get_ifm_ofm()
1684 if ifm is None or ofm is None:
1685 return op
1686 # finds the input(s) to the operation
1687 prev_op = ifm.ops[0]
1688 # Note: the below checks on prev_op require that a first optimize pass on the full graph has been performed
1689 fuse = (
1690 prev_op.run_on_npu
1691 and prev_op.type.npu_block_type != NpuBlockType.Default
1692 and len(ifm.ops) == 1
1693 and len(prev_op.outputs[0].consumers()) == 1
1694 and prev_op.activation is None
1695 )
1696 if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
1697 # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
1698 # LUT currently only works correctly for elementwise ops
1699 fuse = False
1700 if not fuse:
1701 return op
1702 # Move the fused activation function + corresponding info to prev_op
1703 prev_op.activation = op.activation
1704 prev_op.forced_output_quantization = op.forced_output_quantization
1705 if op.activation_lut is not None:
1706 prev_op.set_activation_lut(op.activation_lut)
1707 # Bypass op
1708 prev_op.set_output_tensor(ofm)
wilisa0179a89042022-11-02 17:18:43 +00001709 DebugDatabase.add_optimised(prev_op, prev_op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001710 return op
1711
1712
1713def _leading_pad_ok(leading_pad, stride, kernel_size):
1714 # If kernel size // 2 > stride, then (left, top) padding must be a multiple of stride,
1715 # otherwise replacing PAD by hardware padding would iterate the wrong IFM rows/columns
1716 max_size = kernel_size // 2
1717 return leading_pad == max_size or max_size <= stride or leading_pad % stride == 0
1718
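# --- Illustrative sketch (not part of the rewrite above) ---------------------
# Concrete cases of the rule above for a 7-wide kernel (max_size == 3) with
# stride 2: pads 0 and 2 (multiples of the stride) and 3 (== max_size) are
# fine, but pad 1 is rejected because hardware padding would start the kernel
# window on the wrong IFM column. The _demo_ function is illustrative only.
def _demo_leading_pad_rule():
    assert [_leading_pad_ok(pad, 2, 7) for pad in range(4)] == [True, False, True, True]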
1719
1720def replace_pad_by_hw_pad(op: Operation, arch, nng):
1721 """
1722 Tries to completely remove a PAD operator by using hardware padding.
1723 E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3
1724 is rewritten such that the PAD is removed, and the CONV uses SAME padding.
1725 Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV
1726 if both operations can be run on the NPU.
1727 This is the most efficient way to implement PAD, but cannot be done for all pad sizes.
1728 """
1729 if (
1730 (op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op() or op.type.is_avgpool_op())
Tim Hall0ab2edc2022-02-23 17:58:02 +00001731 and op.type not in (Op.Conv2DBackpropInput, Op.Conv2DBackpropInputSwitchedBias)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001732 and op.run_on_npu
1733 and op.attrs["padding"] == Padding.VALID
1734 ):
1735 pad_op = op.ifm.ops[0]
1736 if pad_op.type != Op.Pad or not pad_op.run_on_npu:
1737 return op
1738 if pad_op.ifm.dtype != pad_op.ofm.dtype or not check_quantized_tens_scaling_equal(pad_op.ofm, pad_op.ifm):
1739 return op
1740 top, left, bottom, right = get_pad_values_from_input(pad_op.inputs[1].values)
1741 k = op.kernel
1742 k_w, k_h = k.dilated_wh()
1743
1744 # Check if the PAD operator can be replaced by hardware padding
1745 if left > k_w // 2 or right > k_w // 2 or top > k_h // 2 or bottom > k_h // 2:
1746 # Too much padding: hardware padding supports at most kernel_size // 2 zeros on each edge
1747 return op
1748 if not _leading_pad_ok(top, k.stride.y, k_h) or not _leading_pad_ok(left, k.stride.x, k_w):
1749 return op
1750
1751 if op.type.is_avgpool_op():
1752 # For average pool, hardware padding can only be used if padding is 0 or kernel size / 2
1753 for pad, k_size in (
1754 (left, k_w),
1755 (right, k_w),
1756 (top, k_h),
1757 (bottom, k_h),
1758 ):
1759 if pad not in (0, k_size // 2):
1760 return op
1761 # Average pool is converted to depthwise, because NPU average pool + same padding
1762 # has a special implementation that is different from PAD followed by average pool with
1763 # valid padding.
1764 k_w, k_h = op.kernel.width, op.kernel.height
1765 ifm = op.ifm
1766 # Remember other inputs
1767 other_inputs = op.inputs[1:]
1768 # Create a weight tensor, all weights are set to 1/(kernel width * kernel height)
1769 quantization = QuantizationParameters(0.0, 255.0)
1770 quantization.scale_f32 = 1.0 / (k_w * k_h)
1771 quantization.zero_point = 0
1772 shape = [k_h, k_w, 1, op.ofm.shape[-1]]
1773 weights = np.full(shape, 1)
1774
1775 weight_tens = create_const_tensor(
1776 op.name + "_weights",
1777 shape,
1778 op.ifm.dtype,
1779 weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001780 purpose=TensorPurpose.Weights,
1781 quantization=quantization,
1782 )
James Peet7519d502021-07-19 16:47:58 +01001783 weight_tens.values = weights
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001784 op.type = Op.DepthwiseConv2DBias
1785 op.inputs = []
1786 op.add_input_tensor(ifm)
1787 op.add_input_tensor(weight_tens)
Tim Hall5ff4cd12023-05-16 22:39:14 +01001788
1789 if op.ifm.dtype == DataType.uint8:
1790 op.rounding_mode = RoundingMode.HalfUp
1791
1792 # Add bias tensor, all biases set to 0
1793 op.inputs.append(None)
1794 fixup_bias_tensors(op, arch, nng, DataType.int32)
1795
1796 else:
1797 op.rounding_mode = RoundingMode.AwayZero
1798
1799 # The DepthwiseConv needs to be performed with the IFM zero point set appropriately so that the correct
1800 # pad values are used. However, in order to use the rounding away from zero mode the zero point needs to
1801 # have been removed so that the zero point is at zero. This is done by adding a kernel sized amount of
1802 # the zero point as a bias. The datatype of the bias needs to be set to int32, even for an int16 IFM,
1803 # because this will cause full precision scaling to be used (see weight compression). Finally, the OFM
1804 # zero point will need forcing to zero (as it has already been removed)
1805 nr_biases = op.inputs[1].shape[-1]
1806 bias_values = [op.ifm.quantization.zero_point * k_h * k_w] * nr_biases
1807 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, bias_values)
1808 op.add_input_tensor(bias_tensor)
1809
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001810 # Add other inputs
1811 op.inputs.extend(other_inputs)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001812
1813 # Bypass the PAD operator
1814 op.set_input_tensor(pad_op.ifm, 0)
1815 # Adjust the padding attributes of the convolution operator
1816 op.attrs["padding"] = Padding.EXPLICIT
1817 op.attrs["explicit_padding"] = (top, left, bottom, right)
1818 op.set_ifm_ofm_shapes()
wilisa0179a89042022-11-02 17:18:43 +00001819 DebugDatabase.add_optimised(op, op)
1820
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001821 return op
1822
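# --- Illustrative sketch (not part of the rewrite above) ---------------------
# The arithmetic behind the kernel-sized zero-point bias used above: summing a
# k_h x k_w window of zero-point-free values and adding zp * k_h * k_w once in
# the bias gives the same accumulator as summing the raw quantised values,
# since sum(q) == sum(q - zp) + k_h * k_w * zp. Minimal numpy check:
def _demo_zero_point_bias(zp=128, k_h=3, k_w=3):
    q = np.random.randint(0, 256, size=(k_h, k_w))
    assert q.sum() == (q - zp).sum() + k_h * k_w * zp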
1823
1824def convert_pad(op: Operation, arch, nng):
1825 """
1826 Rewrites PAD operator to an average pool that copies the IFM to the OFM
1827 + up to 4 average pool operators that fill the OFM with zeros at the borders.
1828 This is done as fall-back for the PAD operators that remain after replace_pad_by_hw_pad
1829 """
1830 if op.type != Op.Pad or not op.run_on_npu:
1831 return op
1832 top, left, bottom, right = get_pad_values_from_input(op.inputs[1].values)
1833
1834 ifm = op.ifm
1835 assert ifm is not None
James Ward3e134342021-10-28 10:01:40 +01001836 ifm_shape = op.ifm_shapes[0]
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001837 ofm = op.ofm
1838 assert ofm is not None
1839 ofm.ops = []
1840 ofm_shape = op.ofm_shapes[0]
1841
1842 # Average pool op that copies IFM to the right place inside the OFM
1843 shp0 = Shape4D(0, 0, 0, 0)
1844 shp_top = shp0.with_height(top)
1845 avgpool_op = create_avg_pool_for_concat(op, op.name + "_main", ifm, ifm_shape, shp_top.with_width(left))
1846 avgpool_op.activation = op.activation
1847 quant = ofm.quantization
1848 pad_value = quant.zero_point
1849 # Add operations that fill the borders of the OFM
1850 if top > 0:
1851 shape = Shape4D(1, top, ofm_shape.width, ofm_shape.depth)
1852 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001853 op.name + "_top", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001854 )
1855 # If top/bottom or left/right are equal, the const tensors can be allocated to the same address
1856 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1857 create_avg_pool_for_concat(op, op.name + "_top", zero_tens, shape, shp0)
1858 if bottom > 0:
1859 shape = Shape4D(1, bottom, ofm_shape.width, ofm_shape.depth)
1860 zero_tens = create_const_tensor(
1861 op.name + "_bottom",
1862 shape.as_list(),
1863 ofm.dtype,
1864 shape.elements() * [pad_value],
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001865 quantization=quant,
1866 )
1867 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1868 create_avg_pool_for_concat(
1869 op, op.name + "_bottom", zero_tens, shape, shp0.with_height(ofm_shape.height - bottom)
1870 )
1871 if left > 0:
1872 shape = Shape4D(1, ifm_shape.height, left, ofm_shape.depth)
1873 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001874 op.name + "_left", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001875 )
1876 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1877 create_avg_pool_for_concat(op, op.name + "_left", zero_tens, shape, shp_top)
1878 if right > 0:
1879 shape = Shape4D(1, ifm_shape.height, right, ofm_shape.depth)
1880 zero_tens = create_const_tensor(
Tim Hall3b1578e2023-01-13 17:57:25 +00001881 op.name + "_right", shape.as_list(), ofm.dtype, shape.elements() * [pad_value], quantization=quant
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001882 )
1883 zero_tens.equivalence_id = create_equivalence_id(tuple(zero_tens.values))
1884 create_avg_pool_for_concat(
1885 op, op.name + "_right", zero_tens, shape, shp_top.with_width(ofm_shape.width - right)
1886 )
1887
1888 op.type = Op.ConcatTFLite
1889 return avgpool_op
1890
1891
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001892def fixup_bias_tensors(op, arch, nng, dtype=None):
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001893 if op.type.needs_bias() and op.bias is None:
1894 # Op has no bias, add bias tensor filled with zeros
1895 nr_biases = op.inputs[1].shape[-1]
1896 bias_values = [0] * nr_biases
Fredrik Svedbergcc219be2022-09-20 16:32:52 +02001897 # The DataType of the bias tensor can be explicitly provided or deduced from the ifm
1898 # DataType. Default is int32 bias for 8-bit ifms and int64 for int16 ifms.
1899 # For int16 the selected bias DataType will have an impact on the scaling
1900 # used when encoding the scales and biases later. The default mode will match the
1901 # reference with reduced scaling for int64 bias.
1902 # This means that in cases (in the graph optimiser) where DepthwiseConv2DBias
1903 # is used to emulate average pool, int32 bias should be selected for full precision
1904 # int16 scaling.
1905 if dtype is None:
1906 dtype = DataType.int64 if op.ifm.dtype == DataType.int16 else DataType.int32
1907 bias_tensor = create_const_tensor(op.name + "_bias", [nr_biases], dtype, bias_values)
Raul Farkas3e7157b2023-05-09 09:09:17 +01001908 bias_index = op.type.info.indices.biases[0]
1909 if bias_index < len(op.inputs):
1910 op.set_input_tensor(bias_tensor, bias_index)
1911 else:
1912 op.add_input_tensor(bias_tensor)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001913
1914 return op
1915
1916
wilisa0146c94772023-02-08 09:56:14 +00001917def detect_asymmetric_weights(op):
1918 # Check all ops (cpu and npu)
1919 if op.type.is_conv2d_op() or op.type.is_depthwise_conv2d_op():
1920 if op.ifm.dtype in (DataType.int8, DataType.int16):
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001921 if not np.all(op.weights.quantization.zero_point == 0):
wilisa0146c94772023-02-08 09:56:14 +00001922 print(f"Warning: Op {op.type} '{op.name}' has asymmetric weights.", end=" ")
1923 return True
1924 return False
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001925
wilisa0146c94772023-02-08 09:56:14 +00001926
1927def fixup_asymmetric_weights(op, arch, nng):
1928 if detect_asymmetric_weights(op):
1929 if op.run_on_npu:
1930 print("Zero points have been adjusted.")
1931 op.weights.quantization.zero_point *= 0
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01001932 return op
1933
1934
wilisa0146c94772023-02-08 09:56:14 +00001935def check_asymmetric_weights(op, arch, nng):
1936 # This function can modify the run_on_npu flag which causes an operator to be placed on the CPU. It is usually only
1937 # set by the supported operator checks. Therefore, it should be run immediately after those checks to avoid the
1938 # possibility of other graph optimiser functions modifying the operator (that is later run on the CPU)
1939 if detect_asymmetric_weights(op):
1940 if op.run_on_npu:
1941 print("To run the operator on Ethos-U use the option --force-symmetric-int-weights")
1942 op.run_on_npu = False
1943 return op
1944
1945
1946def fixup_or_check_asymmetric_weights(force_symmetric_int_weights):
1947 if force_symmetric_int_weights:
1948 return fixup_asymmetric_weights
1949 else:
1950 return check_asymmetric_weights
1951
1952
Rickard Bolina68b82a2023-04-20 15:12:28 +00001953def convert_mean_to_depthwise_conv(op, arch, nng):
Alexander Hansson90c34b52023-05-31 15:03:03 +00001954 """
1955 When h x w <= 4096     When h x w > 4096 there is a need to split into several ops.
1956                        Do this by splitting up h and changing the read_offset/shape.
1957                        Below is an example where ifm is 1x190x64x1
1958 MEAN                   MEAN
1959   |                      |-----------------------|----------------------|
1960 DepthwiseConv2DBias    1_DepthwiseConv2DBias   2_DepthwiseConv2DBias  3_DepthwiseConv2DBias
1961   |                      |                       |                      |
1962 MUL                      |---------ADD-----------|                      |
1963                                    |                                    |
1964                                    |----------------ADD-----------------|
1965                                                     |
1966                                                    MUL
1967 1_DepthwiseConv2DBias: read_offset [0, 0, 0, 0]> read_shape [1, 64, 64, 1]>
1968 2_DepthwiseConv2DBias: read_offset [0, 64, 0, 0]> read_shape [1, 64, 64, 1]>
1969 3_DepthwiseConv2DBias: read_offset [0, 128, 0, 0]> read_shape [1, 62, 64, 1]>
1970 """
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001971 if op.type == Op.Mean and op.run_on_npu:
Alexander Hansson90c34b52023-05-31 15:03:03 +00001972 max_kernel_size = 4096
1973 max_height = 64
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001974 inp, axis = op.inputs
1975 shape = inp.shape
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001976 ofm_shape = op.ofm.shape
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001977 dims = len(shape)
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001978 dims_ofm = len(ofm_shape)
Alexander Hansson90c34b52023-05-31 15:03:03 +00001979 ofmq = op.ofm.quantization
1980 ifmq = op.ifm.quantization
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001981
1982 # Height and width axes have different index depending on dimensions
1983 if axis.shape == [] or axis.shape[0] == 1: # single axis
1984 axis = int(axis.values) if len(axis.shape) == 0 else int(axis.values[0])
Alexander Hansson90c34b52023-05-31 15:03:03 +00001985 # If dims is 4, axis 1 refers to h-dimension
1986 if dims == 4:
1987 reduce_h, reduce_w = (True, False) if axis == 1 else (False, True)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001988 else:
Alexander Hansson90c34b52023-05-31 15:03:03 +00001989 reduce_h, reduce_w = (True, False) if axis == 0 else (False, True)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001990 else: # multiple axes
1991 axis = sorted(axis.values)
Alexander Hansson90c34b52023-05-31 15:03:03 +00001992 reduce_h, reduce_w = (True, True)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001993
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02001994 # Change dimensions to 4
Diqing Zhong1ddb2ed2022-03-09 12:23:47 +01001995 def extend_dims(dim, in_shape):
1996 if dim < 4:
1997 in_shape = [1] + in_shape
1998 if dim == 2:
1999 in_shape += [1]
2000 return in_shape
2001
2002 if dims < 4 or dims_ofm < 4:
2003 # Fix the ofm dimension when keep_dims is false
2004 # e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the ofm_shape should be 1xHx1xC, not 1x1xHxC
2005 if isinstance(axis, int) and dims_ofm + 1 == dims:
2006 ofm_shape.insert(axis, 1)
2007 elif isinstance(axis, list) and (dims_ofm + len(axis) == dims):
2008 for i in axis:
2009 ofm_shape.insert(i, 1)
2010 shape = extend_dims(dims, shape)
2011 dims_ofm = len(ofm_shape)
2012 ofm_shape = extend_dims(dims_ofm, ofm_shape)
2013 op.set_ifm_ofm_shapes()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002014
Alexander Hansson90c34b52023-05-31 15:03:03 +00002015 # Compute kernel sizes for our convolutions
2016 h = shape[1] if reduce_h else 1
2017 w = shape[2] if reduce_w else 1
2018 num_elements_in_axis = h * w
2019
2020 # If one convolution is enough, but height is greater than max kernel height
2021 # reshape from HxW to 1x(HxW)
2022 # This can only be done if the mean is computed over both H and W
2023 if h > max_height and num_elements_in_axis <= max_kernel_size and reduce_h and reduce_w:
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002024 shape = [shape[0], 1, h * w, shape[3]]
2025 op.ifm_shapes[0] = Shape4D(shape)
Alexander Hansson90c34b52023-05-31 15:03:03 +00002026 op.ifm.shape = shape
2027 w = h * w
2028 h = 1
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002029
Alexander Hansson90c34b52023-05-31 15:03:03 +00002030 intermediate_op = None
2031 height_per_conv = min(max_kernel_size // w, h)
2032 height_per_conv = min(height_per_conv, max_height)
2033 num_convs = math.ceil(h / height_per_conv)
2034 convs = list()
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002035
Alexander Hansson90c34b52023-05-31 15:03:03 +00002036 for i in range(num_convs):
2037 is_last_op = i == (num_convs - 1)
2038
2039 intermediate_op = op.clone(f"{op.name}_conv_{i}")
2040
2041 intermediate_op.type = Op.DepthwiseConv2DBias
2042
2043 # Set necessary depthwise attributes
2044 intermediate_op.attrs.update(
2045 {
2046 "padding": Padding.VALID,
2047 "stride_h": 1,
2048 "stride_w": 1,
2049 "strides": (1, 1, 1, 1),
2050 "depth_multiplier": 1,
2051 "channel_multiplier": 1,
2052 "dilation_h_factor": 1,
2053 "dilation_w_factor": 1,
2054 "dilation": (1, 1, 1, 1),
2055 }
2056 )
2057
2058 b, _, _, c = shape
2059
2060 intermediate_tensor = op.ofm.clone(suffix=f"_conv_sum_{i}", set_unique=True)
2061 intermediate_tensor.dtype = DataType.int32
2062 intermediate_op.set_output_tensor(intermediate_tensor)
2063
2064 # as we have several convs, scaling/rounding must be done after the sum has been calculated
2065 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2066
2067 # compute height for the kernel
2068 if is_last_op and h % height_per_conv != 0:
2069 weight_h = h % height_per_conv
2070 else:
2071 weight_h = height_per_conv
2072
2073 # compute ifm read offset and shape for the convolution
2074 read_shape_h = weight_h if reduce_h else shape[1]
2075 read_shape_w = w if reduce_w else shape[2]
2076
2077 intermediate_op.read_offsets[0] = Shape4D([0, i * height_per_conv, 0, 0])
2078 intermediate_op.read_shapes[0] = Shape4D(shape).with_hw(read_shape_h, read_shape_w)
2079
2080 weight_quant = QuantizationParameters(0, 255, scale_f32=1.0, zero_point=0)
2081 weight_shape = [weight_h, w, c, b]
2082 weight_tensor = create_const_tensor(
2083 f"{intermediate_op.name}_weights",
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002084 weight_shape,
Alexander Hansson90c34b52023-05-31 15:03:03 +00002085 DataType.uint8,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002086 np.ones(weight_shape),
Alexander Hansson90c34b52023-05-31 15:03:03 +00002087 TensorPurpose.Weights,
2088 quantization=weight_quant,
2089 )
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002090
Alexander Hansson90c34b52023-05-31 15:03:03 +00002091 weights_1D = np.ones(np.prod(weight_shape))
2092 weight_tensor.equivalence_id = create_equivalence_id(tuple(weights_1D))
2093 weight_tensor.value_id = weight_tensor.equivalence_id
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002094
Alexander Hansson90c34b52023-05-31 15:03:03 +00002095 intermediate_op.set_input_tensor(weight_tensor, 1)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002096
Alexander Hansson90c34b52023-05-31 15:03:03 +00002097 dtype = DataType.int64 if intermediate_op.ifm.dtype == DataType.int16 else DataType.int32
2098 bias_values = [0] * c
2099 bias = create_const_tensor(f"{intermediate_op.name}_bias", [c], dtype, bias_values)
2100 bias.equivalence_id = create_equivalence_id(tuple(bias_values))
2101 bias.value_id = bias.equivalence_id
2102 intermediate_op.inputs.append(bias)
2103 intermediate_op.set_ifm_ofm_shapes()
Johan Alfven7b3008a2023-04-13 18:54:47 +02002104
Alexander Hansson90c34b52023-05-31 15:03:03 +00002105 # We want to avoid reshaping the tensor directly, so as not to affect other ops;
2106 # instead we update the shape explicitly for this operation
2107 intermediate_op.ifm_shapes[0] = Shape4D(shape)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002108
Alexander Hansson90c34b52023-05-31 15:03:03 +00002109 convs.append(intermediate_op)
2110 DebugDatabase.add_optimised(op, intermediate_op)
2111
2112 # If we have more than one convolution
2113 # We use add operations to accumulate the intermediate tensors
2114 if len(convs) > 1:
2115 prev_add_op = None
2116 idx = 0
2117
2118 while len(convs):
2119 intermediate_tensor = op.ofm.clone(suffix=f"_add_sum_{idx}", set_unique=True)
2120 intermediate_tensor.dtype = DataType.int32
2121
2122 one_scale_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
2123
2124 ifm = convs.pop().ofm
2125 if not prev_add_op:
2126 ifm2 = convs.pop().ofm
2127 else:
2128 ifm2 = prev_add_op.ofm
2129
2130 intermediate_op = create_add(f"{op.name}_add_{idx}", ifm, ifm2, one_scale_quant)
2131 intermediate_op.explicit_scaling = ExplicitScaling(False, shift=[0], multiplier=[1])
2132 intermediate_op.set_output_tensor(intermediate_tensor)
2133 intermediate_op.set_ifm_ofm_shapes()
2134
2135 prev_add_op = intermediate_op
2136 idx += 1
2137
2138 DebugDatabase.add_optimised(op, intermediate_op)
2139
2140 # Convert the original mean op to our final Mul operation
2141 # Which scales and divides by num_elements_in_axis
2142 op.type = Op.Mul
2143 op.name = f"{op.name}_mul"
2144 op.attrs = {}
2145 op.set_input_tensor(intermediate_op.ofm, 0)
Rickard Bolina68b82a2023-04-20 15:12:28 +00002146
Johan Alfven7b3008a2023-04-13 18:54:47 +02002147 # The multiplier is calculated in the same way as in the reference,
2148 # clamping the shift value at the price of some precision loss.
Johan Alfven7b3008a2023-04-13 18:54:47 +02002149 output_multiplier, output_shift_vela = quantise_scale(np.double(ifmq.scale_f32) / np.double(ofmq.scale_f32))
2150
2151 # Convert to reference representation shift value
2152 output_shift = 31 - output_shift_vela
2153
2154 # Reference calculation
2155 # round_down_log2 same as 63 - CountLeadingZeros(num_elements_in_axis)
2156 shift = round_down_log2(num_elements_in_axis)
2157 shift = min(shift, 32)
2158 shift = min(shift, 31 + output_shift)
2159 output_multiplier = (output_multiplier << shift) // num_elements_in_axis
2160 output_shift = output_shift - shift
2161
2162 # Convert to vela representation shift
2163 output_shift_vela = 31 - output_shift
2164
2165 # For int32 scaling is not supported so instead multiply with the scale
2166 # intermediate * scale -> round and shift.
Alexander Hansson90c34b52023-05-31 15:03:03 +00002167 identity_quant = QuantizationParameters(scale_f32=1.0, zero_point=0)
Johan Alfven7b3008a2023-04-13 18:54:47 +02002168 scalar = create_const_tensor(
2169 op.name + "_scalar", [1, 1, 1, 1], DataType.int32, [output_multiplier], quantization=identity_quant
2170 )
Alexander Hansson90c34b52023-05-31 15:03:03 +00002171 op.set_input_tensor(scalar, 1)
2172 op.set_ifm_ofm_shapes()
Johan Alfven7b3008a2023-04-13 18:54:47 +02002173
2174 # Reference using TFL rounding for the multiply
Alexander Hansson90c34b52023-05-31 15:03:03 +00002175 op.rounding_mode = RoundingMode.TFLite
Johan Alfven7b3008a2023-04-13 18:54:47 +02002176
2177 # Need to use explicit scaling to get the wanted shift
Alexander Hansson90c34b52023-05-31 15:03:03 +00002178 op.explicit_scaling = ExplicitScaling(False, [output_shift_vela], [1])
2179 DebugDatabase.add_optimised(op, op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002180 return op
2181
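# --- Illustrative sketch (not part of the rewrite above) ---------------------
# The reference MEAN rescaling above, collected into one helper: fold the
# division by num_elements_in_axis into the requantisation multiplier, then
# translate between the reference shift convention and vela's (31 - shift).
# A minimal sketch reusing quantise_scale and round_down_log2 as above.
def _demo_mean_scaling(ifm_scale, ofm_scale, num_elements_in_axis):
    multiplier, shift_vela = quantise_scale(np.double(ifm_scale) / np.double(ofm_scale))
    shift = 31 - shift_vela  # reference representation
    s = min(round_down_log2(num_elements_in_axis), 32, 31 + shift)
    multiplier = (multiplier << s) // num_elements_in_axis
    return multiplier, 31 - (shift - s)  # back to vela representation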
2182
Johan Alfvence502732023-04-24 13:35:40 +02002183def convert_ops_to_lut(op, arch, nng):
2184 if op.type == Op.Exp:
2185 if op.ifm.dtype == DataType.int8:
2186 return create_lut_8bit_op(op, math.exp, "exp")
2187 elif op.ifm.dtype == DataType.int16:
2188 return create_lut_int16_op(op, math.exp, "exp")
2189 else:
2190 # Should already be caught in tflite supported ops
2191 assert False, f"Unsupported data type {op.ifm.dtype} for {op.type}"
2192
Johan Alfven8e525ca2023-05-07 13:12:37 +02002193 if op.type == Op.Rsqrt:
2194 return create_lut_rsqrt_int8_op(op)
2195
Johan Alfvence502732023-04-24 13:35:40 +02002196 return op
2197
2198
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002199def optimise_quantize(op: Operation, arch, nng):
2200
2201 if op.type == Op.Quantize and op.run_on_npu:
2202
2203 ifm, ofm = op.get_ifm_ofm()
2204 input_values = ifm.values
2205
2206 # Guard clause - input not const or no values to quantize
2207 if ifm.ops[0].type != Op.Const or input_values is None:
2208 return op
2209
2210 # Singular val in numpy array, convert to indexable array
2211 if input_values.ndim == 0:
2212 input_values = np.array([input_values])
2213
Fredrik Svedberg11563172022-07-06 14:54:12 +02002214 # requantized int8 to int8 or int16 to int16
2215 if ifm.dtype == ofm.dtype == DataType.int8 or ifm.dtype == ofm.dtype == DataType.int16:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002216
2217 # scale needs to use double precision to match TFLite reference kernel
2218 effective_scale = np.float64(ifm.quantization.scale_f32) / np.float64(ofm.quantization.scale_f32)
2219 effective_multiplier, effective_shift = quantise_scale(effective_scale)
2220
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002221 requantized_vals = []
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002222 for val in input_values.flatten():
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002223 input_val = val - ifm.quantization.zero_point
2224
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002225 ofm_val = fp_math.multiply_by_quantized_multiplier(input_val, effective_multiplier, effective_shift)
2226 ofm_val += ofm.quantization.zero_point
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002227
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002228 clamped_ofm_value = max(min(ofm_val, ofm.quantization.quant_max), ofm.quantization.quant_min)
2229 requantized_vals.append(clamped_ofm_value)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002230
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002231 ofm.values = np.array(requantized_vals, ofm.dtype.as_numpy_type())
2232 ofm.values.shape = input_values.shape
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002233
2234 # Case: Float input - quantize to int
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002235 elif ifm.dtype.type == BaseType.Float:
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002236
2237 quantized_vals = []
2238 for val in input_values:
2239
2240 # Derive quantized value
2241 quant_val = (val / ofm.quantization.scale_f32) + ofm.quantization.zero_point
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002242 clamped_quantized_val = np.clip(quant_val, ofm.quantization.quant_min, ofm.quantization.quant_max)
2243 quantized_vals.append(clamped_quantized_val)
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002244
2245 # Pass the statically calculated quant val to output tensor
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002246 ofm.values = np.array(quantized_vals, ofm.dtype.as_numpy_type())
2247
2248 # Unsupported data type
2249 else:
2250 return op
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002251
2252 # Make quantize op const and disconnect from parent node
2253
2254 # Remove reference of the current quant op from the parent tensor's consumer list
2255 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2256
2257 # Clear any references to parent node
2258 op.inputs = []
2259
2260 # Convert this quantize op to const
2261 op.type = Op.Const
2262
2263 return op
2264
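# --- Illustrative sketch (not part of the fold above) -------------------------
# The float branch of optimise_quantize in isolation: quantise a constant
# float array to int8 at compile time. A minimal numpy sketch with explicit
# round-to-nearest (the pass above relies on the final dtype conversion);
# per-tensor scale and zero point are assumed.
def _demo_fold_float_quantize(values, scale, zero_point, qmin=-128, qmax=127):
    q = np.round(values / scale + zero_point)
    return np.clip(q, qmin, qmax).astype(np.int8)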
2265
Ayaan Masood4965fae2022-06-29 11:30:57 +01002266def convert_shape_op_to_constant_tensor(op: Operation, arch, nng):
2267 """Static optimisation for SHAPE operator output value known at compile time"""
2268
2269 # Disconnect SHAPE operator from its parent and transform SHAPE OP into constant
2270
2271 if op.type == Op.Shape and op.run_on_npu:
2272
2273 ifm, ofm = op.get_ifm_ofm()
2274
2275 if len(ifm.shape) != ofm.shape[0]:
2276 return op
2277
2278 # Remove reference of the current shape op from the parent tensor's consumer list
2279 ifm.consumer_list = [consumer for consumer in ifm.consumer_list if consumer.op_index != op.op_index]
2280
2281 # Clear any references to parent node
2282 op.inputs = []
2283
2284 # Convert this SHAPE op to const
2285 op.type = Op.Const
2286
2287 # Add size calculation to shape output tensors
2288 ofm.values = np.array(ifm.shape)
2289
2290 return op
2291
2292
Tim Hallea4ba662022-11-11 18:19:53 +00002293def fixup_dilation_gt2(op, arch, nng):
2294 assert op.run_on_npu
2295 if op.type == Op.Conv2DBias or op.type == Op.DepthwiseConv2DBias:
2296 dilation_w, dilation_h = op.get_kernel_dilation()
2297
2298 # if dilation in either axis is greater than that supported by the hardware then we must manually dilate the
2299 # kernel
2300 if dilation_w > 2 or dilation_h > 2:
2301 kernel_w, kernel_h = op.get_kernel_size()
2302 kernel_ic = op.weights.shape[-2]
2303 kernel_oc = op.weights.shape[-1]
2304
2305 # if the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
2306 # of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
2307 # odd = 1, even = 2
2308 hw_dilation_h = 1 if (dilation_h & 1) else 2
2309 hw_dilation_w = 1 if (dilation_w & 1) else 2
2310
2311 scale_dilation_h = dilation_h // hw_dilation_h
2312 scale_dilation_w = dilation_w // hw_dilation_w
2313
2314 # create new empty kernel (HWIO format)
2315 new_kernel_h = (kernel_h - 1) * scale_dilation_h + 1
2316 new_kernel_w = (kernel_w - 1) * scale_dilation_w + 1
2317
2318 new_kernel_shape = [new_kernel_h, new_kernel_w, kernel_ic, kernel_oc]
2319 new_kernel_values = np.zeros(new_kernel_shape, dtype=op.weights.values.dtype)
2320
2321 # copy the original kernel values into the new sparse kernel
2322 for h in range(0, kernel_h):
2323 for w in range(0, kernel_w):
2324 new_h = h * scale_dilation_h
2325 new_w = w * scale_dilation_w
2326 new_kernel_values[new_h, new_w, :, :] = op.weights.values[h, w, :, :]
2327
2328 # update the weight tensor with the new dilated kernel
2329 op.weights.shape = new_kernel_shape
2330 op.weights.values = new_kernel_values
2331
2332 # enable(=2) / disable(=1) hardware dilation
2333 op.attrs["dilation"] = (1, hw_dilation_h, hw_dilation_w, 1) # nhwc format
2334 op.attrs["dilation_h_factor"] = hw_dilation_h
2335 op.attrs["dilation_w_factor"] = hw_dilation_w
2336
2337 return op
2338
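# --- Illustrative sketch (not part of the fixup above) ------------------------
# The kernel expansion above in isolation: a convolution with dilation d is
# equivalent to a dilation-1 convolution with a zero-stuffed kernel of size
# (k - 1) * d + 1. The pass keeps a residual hardware dilation of 2 for even
# dilations; this sketch shows only the zero-stuffing step, for HWIO kernels.
def _demo_dilate_kernel(kernel, dilation_h, dilation_w):
    k_h, k_w = kernel.shape[:2]
    new_shape = ((k_h - 1) * dilation_h + 1, (k_w - 1) * dilation_w + 1) + kernel.shape[2:]
    dilated = np.zeros(new_shape, dtype=kernel.dtype)
    dilated[::dilation_h, ::dilation_w] = kernel  # original taps on a sparse grid
    return dilated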
2339
Tim Hall2180a172023-03-10 18:11:34 +00002340def fixup_reshape(op, arch, nng):
2341 def _get_explicit_shape(implicit_shape, total_size):
2342 # the explicit shape is a copy of the implicit shape but with the special -1 (remaining size) value converted to
2343 # the appropriate value
2344 if implicit_shape is None:
2345 return None
2346
2347 explicit_shape = list(implicit_shape)
2348 if -1 in explicit_shape:
2349 explicit_shape[explicit_shape.index(-1)] = int(total_size / abs(np.prod(implicit_shape)))
2350
2351 return explicit_shape
2352
2353 if op.type == Op.Reshape:
2354 ifm_tensor, _, ofm_tensor = op.get_ifm_ifm2_ofm()
2355 ifm_size = ifm_tensor.elements()
2356 ofm_shape = ofm_tensor.shape
2357
2358 new_shape_tensor_shape = op.inputs[1].values.flatten() if len(op.inputs) > 1 else None
2359 new_shape_tensor_shape = _get_explicit_shape(new_shape_tensor_shape, ifm_size)
2360
2361 new_shape_attribute = op.attrs.get("new_shape", None)
2362 new_shape_attribute = _get_explicit_shape(new_shape_attribute, ifm_size)
2363
2364 # if present the new shape tensor overrides the new_shape attribute
2365 if new_shape_tensor_shape is not None:
2366 # check tensor
2367 if not np.array_equal(new_shape_tensor_shape, ofm_shape):
2368 print(
2369 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new shape tensor"
2370 f" ({new_shape_tensor_shape}) that does not match output tensor shape {ofm_shape}. Will use output"
2371 f" tensor shape."
2372 )
2373 elif new_shape_attribute is not None:
2374 # check attribute
2375 if not np.array_equal(new_shape_attribute, ofm_shape):
2376 print(
2377 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' has new_shape attribute"
2378 f" ({new_shape_attribute}) that does not match output tensor shape {ofm_shape}. Will use output"
2379 f" tensor shape."
2380 )
2381 else:
2382 print(
2383 f"Warning: {optype_to_builtintype(op.type)} '{op.name}' does not have a new shape tensor or a new_shape"
2384 f" attribute. Will use output tensor shape {ofm_shape}."
2385 )
2386
2387 # force new shape tensor to output shape
2388 new_shape_tensor = create_const_tensor(
2389 op.name + "_new_shape", [len(ofm_shape)], DataType.int32, np.array(ofm_shape, np.int32)
2390 )
2391 if len(op.inputs) > 1:
2392 op.set_input_tensor(new_shape_tensor, 1)
2393 else:
2394 op.add_input_tensor(new_shape_tensor)
2395
2396 # force new_shape attribute to output shape
2397 op.attrs["new_shape"] = ofm_shape
2398
2399 return op
2400
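# --- Illustrative sketch (not part of the fixup above) ------------------------
# The -1 resolution in _get_explicit_shape in isolation: the special value
# absorbs whatever element count the other dimensions leave over. Minimal check:
def _demo_explicit_shape():
    total_size = 24  # e.g. a 1x4x6 input tensor
    implicit_shape = [2, -1, 3]
    explicit_shape = list(implicit_shape)
    explicit_shape[explicit_shape.index(-1)] = int(total_size / abs(np.prod(implicit_shape)))
    assert explicit_shape == [2, 4, 3]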
2401
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002402def supported_operator_check(op, arch, nng):
Jonas Ohlsson45e653d2021-07-26 16:13:12 +02002403 op.run_on_npu = arch.tflite_supported_operators.is_operator_supported(op)
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002404 return op
2405
2406
wilisa0146c94772023-02-08 09:56:14 +00002407def tflite_optimise_graph(nng, arch, force_symmetric_int_weights):
Fredrik Svedberg11563172022-07-06 14:54:12 +02002408 # Compile time static optimisations
wilisa0146c94772023-02-08 09:56:14 +00002409 optimisation_list = [
2410 optimise_quantize,
2411 convert_shape_op_to_constant_tensor,
2412 fixup_or_check_asymmetric_weights(force_symmetric_int_weights),
2413 ]
Ayaan Masood25f48dd2022-06-29 18:16:04 +01002414
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002415 for idx, sg in enumerate(nng.subgraphs):
2416 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002417 nng,
2418 sg,
2419 arch,
2420 [],
Ayaan Masood4965fae2022-06-29 11:30:57 +01002421 optimisation_list,
2422 rewrite_unsupported=False,
2423 )
2424
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002425 # Pre-processing step
Tim Hall2180a172023-03-10 18:11:34 +00002426 pre_process_list = [supported_operator_check, set_ifm_ofm_op_shapes, fixup_reshape]
Fredrik Svedberga04f2f72022-07-06 13:42:24 +02002427
Ayaan Masood4965fae2022-06-29 11:30:57 +01002428 for idx, sg in enumerate(nng.subgraphs):
2429 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2430 nng,
2431 sg,
2432 arch,
2433 [],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002434 pre_process_list,
2435 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002436 )
2437
2438 # Handle Concat Ops
2439 for idx, sg in enumerate(nng.subgraphs):
2440 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [rewrite_concat_ops])
2441 sg.refresh_after_modification()
2442
2443 # Handle Split Ops
2444 for idx, sg in enumerate(nng.subgraphs):
2445 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2446 nng,
2447 sg,
2448 arch,
2449 [],
2450 [rewrite_unpack_output, rewrite_stridedslice_output, convert_nop_split_to_identity],
2451 rewrite_unsupported=False,
2452 )
2453
2454 for idx, sg in enumerate(nng.subgraphs):
2455 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002456 nng,
2457 sg,
2458 arch,
2459 [rewrite_split_ops],
2460 [],
2461 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002462 )
2463
Johan Alfvena5e1b622023-02-02 14:59:03 +01002464 # Bypass or rewrite memory only operators
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002465 for idx, sg in enumerate(nng.subgraphs):
2466 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002467 nng,
2468 sg,
2469 arch,
2470 [],
Johan Alfvena5e1b622023-02-02 14:59:03 +01002471 [bypass_memory_only_ops],
Jonas Ohlssond8575072022-03-30 10:30:25 +02002472 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002473 )
2474
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002475 # Rewrite of operators
2476 op_rewrite_list = [
2477 set_tensor_equivalence,
Johan Alfvence502732023-04-24 13:35:40 +02002478 convert_ops_to_lut,
Rickard Bolina68b82a2023-04-20 15:12:28 +00002479 convert_mean_to_depthwise_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002480 convert_depthwise_to_conv,
2481 convert_conv_to_fc,
Fredrik Svedberg0ac08042023-04-11 22:35:04 +02002482 convert_lstm,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002483 convert_softmax,
Fredrik Svedberg8ddd4892022-08-19 16:06:04 +02002484 convert_prelu,
Fredrik Svedberg36424312022-09-16 09:39:26 +02002485 convert_mul_max_to_abs_or_lrelu,
2486 convert_lrelu,
Raul Farkas3e7157b2023-05-09 09:09:17 +01002487 convert_avg_pool_to_conv2d,
Raul Farkas69782af2023-05-09 10:39:52 +01002488 fixup_strided_conv,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002489 convert_hardswish_to_lut,
2490 rewrite_fully_connected_input,
2491 convert_batched_fc_shape,
2492 fixup_conv2d_backprop,
2493 fixup_relus_with_differing_ifm_ofm_scaling,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002494 reorder_depthwise_weights,
Rickard Bolin6986a072022-12-19 12:33:40 +00002495 convert_argmax_to_depthwise_conv_and_max_pool,
Tim Hall885033b2022-07-21 11:46:03 +01002496 fixup_resize,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002497 fixup_bias_tensors,
Fredrik Svedbergcc8569f2021-11-01 14:25:29 +01002498 fixup_asymmetric_weights,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002499 convert_tanh_sigmoid_to_lut,
2500 replace_pad_by_hw_pad,
Tim Hallea4ba662022-11-11 18:19:53 +00002501 fixup_dilation_gt2,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002502 ]
2503
2504 for idx, sg in enumerate(nng.subgraphs):
2505 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
Jonas Ohlssond8575072022-03-30 10:30:25 +02002506 nng,
2507 sg,
2508 arch,
2509 [],
2510 op_rewrite_list,
2511 rewrite_unsupported=False,
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002512 )
2513
2514 for idx, sg in enumerate(nng.subgraphs):
2515 # remove passthrough tensors and attempt further optimizations
2516 nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(
2517 nng,
2518 sg,
2519 arch,
2520 [remove_passthrough_tensor],
2521 [fuse_activation_function_with_prev, convert_pad, add_padding_fields],
2522 )
2523
2524 # Removal of SplitSliceRead, need to be done after optimisation has been performed,
2525 # since ifm/ofm_shapes are of importance to this function
2526 for sg in nng.subgraphs:
2527 rewrite_graph.visit_graph_post_order(sg.output_tensors, arch, [], [remove_SplitSliceRead])
2528 sg.refresh_after_modification()
2529
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002530 # Make sure that const optimisations on subgraph outputs are handled correctly
2531 for sg in nng.subgraphs:
2532 for ofm in sg.output_tensors:
2533 if ofm.is_const and ofm.ops[0].type_changed:
2534 # Subgraph output cannot be const - insert a memory copy
2535 op = ofm.ops[0]
2536 ofm_clone = ofm.clone()
2537 ofm_clone.values = ofm.values
2538 ofm.values = None
Tim Hall3b1578e2023-01-13 17:57:25 +00002539 zero = create_const_tensor("zero", [1], ofm.dtype, [0], quantization=ofm.quantization)
Fredrik Svedbergf3c7d552022-11-04 09:48:49 +01002540 memcpy = create_add_nop(f"{ofm.name}_copy")
2541 memcpy.add_input_tensor(ofm_clone)
2542 memcpy.add_input_tensor(zero)
2543 memcpy.set_output_tensor(ofm)
2544 memcpy.set_ifm_ofm_shapes()
2545 op.set_output_tensor(ofm_clone)
2546 DebugDatabase.add_optimised(op, memcpy)
2547
Patrik Gustavsson8f1f9aa2021-06-28 07:41:58 +02002548 return nng