Blame - ethosu/vela/graph_optimiser.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# Early optimisation of the network graph, using the rewrite_graph module to do the traversal of the graph. These are

18

# split into two parts optimise_graph_a and optimise_graph_b.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

19

import math

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

Louis Verhaard

2020-08-25 13:36:41 +0200

[diff] [blame]

23

from . import fp_math

Louis Verhaard

2020-08-13 11:47:36 +0200

[diff] [blame]

24

from . import lut

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

25

from . import rewrite_graph

Louis Verhaard

2020-08-25 13:36:41 +0200

[diff] [blame]

26

from . import scaling

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

27

from .data_type import DataType

Louis Verhaard

2020-05-25 15:05:26 +0200

[diff] [blame]

28

from .errors import UnsupportedFeatureError

Dwight Lidman

2020-05-29 09:37:03 +0200

[diff] [blame]

29

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Louis Verhaard

2020-06-03 08:56:44 +0200

[diff] [blame]

30

from .numeric_util import full_shape

Michael McGeagh

8dbf8cf

2020-09-08 11:09:48 +0100

[diff] [blame^]

31

from .operation import create_avgpool_nop

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

32

from .operation import NpuBlockType

33

from .operation import Operation

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

34

from .softmax import SoftMax

Michael McGeagh

2020-08-07 11:54:28 +0100

[diff] [blame]

35

from .tensor import create_const_tensor

36

from .tensor import create_reshape_tensor

Charles Xu

2020-07-02 15:12:40 +0200

[diff] [blame]

37

from .tensor import QuantizationParameters

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

38

from .tensor import Tensor

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

39

40

# Op types that remove_passthrough_tensor() bypasses
passthrough_nodes = {"Identity"}

# Op type groups; mark_npu_block_type() maps each group to an NpuBlockType
conv_op = {"Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitchedBias", "Conv2DBiasAct"}
fc_op = {
    "MatMul",
    "QuantizedMatMul",
    "BlockLSTM",
    "RnnAct",
    "UnidirectionalSequenceRnnAct",
    "BidirectionalSequenceRnnAct",
    "LstmAct",
    "UnidirectionalSequenceLstmAct",
    "BidirectionalSequenceLstmAct",
    "FullyConnectedAct",
}
depthwise_op = {"DepthwiseConv2dNative", "DepthwiseConv2dBiasAct"}
pool_op = {
    "AvgPool",
    "MaxPool",
    "QuantizedAvgPool",
    "QuantizedMaxPool",
    "AvgPoolAct",
    "MaxPoolAct",
    "ResizeBilinear",
}
reduce_sum_ops = {"ReduceSum"}
binary_elementwise_op = {"AddAct", "MulAct", "SubAct", "Maximum", "Minimum"}
elementwise_op = {"LeakyRelu", "Abs", "CLZ", "SHL", "SHR"} | binary_elementwise_op

# Activation groups; relu_ops is a subset of activation_ops
relu_ops = {"Relu", "Relu6", "ReluN1To1"}
activation_ops = {"Sigmoid", "Tanh"} | relu_ops

# Op types that get_prepend_op() and set_tensor_equivalence() treat as memory-only
memory_only_ops = {"Reshape"}

67

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

68

69

def remove_passthrough_tensor(tens, arch):
    """Return the tensor feeding a passthrough op, or `tens` itself.

    When `tens` is produced by exactly one op whose type is listed in
    passthrough_nodes, the single input of that op is returned in its place.

    tens: tensor to inspect.
    arch: unused by this function.
    """
    producers = tens.ops
    if len(producers) != 1 or producers[0].type not in passthrough_nodes:
        return tens

    passthrough_op = producers[0]
    assert len(passthrough_op.inputs) == 1
    return passthrough_op.inputs[0]

def rewrite_concat(tens, arch):
    """Replace an NPU concat producing `tens` with one ConcatSliceWrite op per input.

    Each new op takes one concat input, outputs `tens` and records the slice
    [concat_start, concat_end) it covers along concat_axis.

    tens: tensor to inspect; returned in every case.
    arch: unused by this function.
    """
    producers = tens.ops
    if len(producers) != 1 or not producers[0].is_concat_op():
        return tens

    concat_op = producers[0]
    if tens != concat_op.outputs[0]:
        # don't attempt to rewrite the min/max outputs of QuantizedConcat
        return tens

    # Not supported so leave it and run on CPU
    if not concat_op.run_on_npu:
        return tens

    inputs, axis = concat_op.get_concat_inputs_axis()

    tens.ops = []
    write_pos = 0
    for idx, inp in enumerate(inputs):
        slice_op = Operation("ConcatSliceWrite", concat_op.name + str(idx))
        slice_op.inputs = [inp]
        slice_op.outputs = [tens]
        slice_op.attrs["concat_axis"] = axis
        slice_op.attrs["concat_start"] = write_pos
        write_pos += inp.shape[axis]
        slice_op.attrs["concat_end"] = write_pos
        slice_op.run_on_npu = True
        tens.ops.append(slice_op)
    # The slices must exactly tile the output along the concat axis
    assert tens.shape[axis] == write_pos

    # If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
    # multiple of 16. This as, it is only then the address offset for the ofm, for all operations, will be 16 byte
    # aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
    # and those addresses are always 16 byte aligned due to the NHCWB16 format.
    is_channel_axis = axis == (len(tens.shape) - 1)
    if is_channel_axis and any(slice_op.attrs["concat_start"] % 16 != 0 for slice_op in tens.ops):
        tens.avoid_NHCWB16 = True

    return tens

def rewrite_split(tens, arch):
    """Replace an NPU split producing `tens` with a single SplitSliceRead op.

    The new op reads the split input and records, in split_start/split_end,
    the region of that input which `tens` corresponds to.

    tens: tensor to inspect; returned in every case.
    arch: unused by this function.
    """
    producers = tens.ops
    if len(producers) != 1 or not producers[0].is_split_op():
        return tens

    split_op = producers[0]

    # Not supported so leave it and run on CPU
    if not split_op.run_on_npu:
        return tens

    inp, outputs, axis, offset_start, offset_end = split_op.get_split_inputs_axis()

    tens.ops = []
    read_op = Operation("SplitSliceRead", split_op.name)
    read_op.inputs = [inp]

    # For Split the offset cannot be extracted from the tensor so it has to
    # be calculated from the index of the output tensor
    if axis is not None:
        rank = len(tens.shape)
        # Get the start and end of the split
        offset_start = [0] * rank
        offset_end = [0] * rank
        # Accumulate the extents of all outputs that precede tens
        for out in outputs:
            if out == tens:
                break
            offset_start[axis] += out.shape[axis]

        # If start offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input
        if offset_start[-1] % 16 != 0:
            inp.avoid_NHCWB16 = True

        offset_end[axis] = offset_start[axis] + tens.shape[axis]

    read_op.attrs["split_start"] = offset_start
    read_op.attrs["split_end"] = offset_end
    read_op.run_on_npu = True
    read_op.set_output_tensor(tens)

    return tens

def needed_total_padding(input_size, stride, filter_size):
    """Return the total padding needed along one dimension.

    The output size is input_size divided by stride, rounded up. The padding
    is however much the input falls short of what the last filter position
    needs, and is never negative.
    """
    output_size = (input_size + stride - 1) // stride
    last_window_start = (output_size - 1) * stride
    shortfall = last_window_start + filter_size - input_size
    return shortfall if shortfall > 0 else 0

def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims):
    """Compute the explicit padding and skirt for a kernel.

    padding_type: b"SAME" or b"VALID".
    kernel_size: (height, width) of the kernel.
    stride: sequence indexed at [1] for height and [2] for width.
    input_dims: sequence indexed at [1] for height and [2] for width.

    Returns (padding, skirt), each a (top, left, bottom, right) tuple.
    Raises UnsupportedFeatureError for any other padding type.
    """
    ypad = needed_total_padding(int(input_dims[1]), int(stride[1]), int(kernel_size[0]))
    xpad = needed_total_padding(int(input_dims[2]), int(stride[2]), int(kernel_size[1]))

    if padding_type == b"SAME":
        # An odd total puts the extra element on the bottom/right side
        top_pad, bottom_pad = (ypad + 0) // 2, (ypad + 1) // 2
        left_pad, right_pad = (xpad + 0) // 2, (xpad + 1) // 2
    elif padding_type == b"VALID":
        top_pad = left_pad = bottom_pad = right_pad = 0
    else:
        raise UnsupportedFeatureError("Unknown padding {}".format(str(padding_type)))

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    # The skirt's bottom/right come from the total padding, so they can be non-zero for VALID
    skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad)
    return padding, skirt

180

Tim Hall

2020-06-15 20:47:35 +0100

[diff] [blame]

181

Jacob Bohlin

2020-07-07 17:15:22 +0200

[diff] [blame]

182

def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_dims, upscaling_factor):
    """Compute the explicit padding and skirt for a kernel applied to an upscaled input.

    padding_type: b"SAME" or b"VALID".
    kernel_size: (height, width) of the kernel.
    stride: sequence indexed at [1] for height and [2] for width.
    input_dims: sequence indexed at [1] for height and [2] for width (before upscaling).
    upscaling_factor: integer factor applied to the input height and width.

    Returns (padding, skirt), each a (top, left, bottom, right) tuple; the skirt
    equals the padding.
    Raises UnsupportedFeatureError for any other padding type.
    """
    kernel_height, kernel_width = kernel_size[0], kernel_size[1]
    if padding_type == b"SAME":
        ypad = needed_total_padding(int(input_dims[1]) * upscaling_factor, int(stride[1]), int(kernel_height))
        xpad = needed_total_padding(int(input_dims[2]) * upscaling_factor, int(stride[2]), int(kernel_width))

        right_pad = max(((xpad + 1) // upscaling_factor) - 1, 0)
        bottom_pad = max(((ypad + 1) // upscaling_factor) - 1, 0)
        left_pad = max(kernel_width - 1 - right_pad, 0)
        top_pad = max(kernel_height - 1 - bottom_pad, 0)

    elif padding_type == b"VALID":
        right_pad = max(kernel_width - 2, 0)
        bottom_pad = max(kernel_height - 2, 0)
        left_pad = kernel_width - 1
        top_pad = kernel_height - 1
    else:
        # Raise explicitly, as calc_padding_and_skirt does: an assert would be stripped
        # under -O and the function would then fail on unbound padding variables.
        raise UnsupportedFeatureError("Unknown padding {}".format(str(padding_type)))

    padding = (top_pad, left_pad, bottom_pad, right_pad)
    skirt = padding
    return padding, skirt

204

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

205

206

def fixup_conv2d_backprop(op, arch):
    """Turn a Conv2DBackpropInput op into Conv2DBackpropInputSwitchedBias in place.

    The first and third inputs are swapped and the stride attributes are
    forced to 1. Ops of any other type are returned untouched.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type != "Conv2DBackpropInput":
        return op

    # flip the inputs
    first_input, third_input = op.inputs[0], op.inputs[2]
    op.inputs[0] = third_input
    op.inputs[2] = first_input
    op.type = "Conv2DBackpropInputSwitchedBias"

    # Update strides
    op.attrs.update(stride_w=1, stride_h=1, strides=(1, 1, 1, 1))
    return op

Charles Xu

2020-07-02 15:12:40 +0200

[diff] [blame]

218

# Convert the op to an elementwise add
def convert_resizebilinear_1x1_to_add(op):
    """Rewrite `op` in place as an AddAct of its original input and an all-zero tensor.

    A new zero-filled tensor with the shape of the op's output becomes the
    first input; the original first input moves to the second input slot.

    Returns the modified op.
    """
    op.type = "AddAct"
    op.name += "_add"
    op.attrs["npu_block_type"] = NpuBlockType.ElementWise
    op.attrs["resizebilinear"] = True

    # Create an input tensor filled with zeros
    out_shape = op.outputs[0].shape
    zero_tens = Tensor(out_shape, op.inputs[0].dtype, op.inputs[1].name + "_add")
    zero_tens.values = np.zeros(out_shape)
    zero_tens.quant_values = np.zeros(out_shape, np.uint8)
    zero_quant = QuantizationParameters(0.0, 255.0)
    zero_quant.scale_f32 = 1.0
    zero_quant.zero_point = 0
    zero_tens.quantization = zero_quant
    zero_tens.consumer_list = [op]

    # Hand the zero tensor to the op that produced the original second input
    producer_of_second_input = op.inputs[1].ops[0]
    producer_of_second_input.set_output_tensor(zero_tens)

    # Set the add inputs
    original_ifm = op.inputs[0]
    op.inputs[1] = original_ifm
    op.inputs[0] = zero_tens

    return op

Charles Xu

2020-08-06 12:17:26 +0200

[diff] [blame]

242

# Convert ResizeBilinear to a number of 2x2 pool ops
def convert_resizebilinear_to_2x2_pool(op):
    """Rewrite `op` as a chain of 2x2, stride-1 ops that each roughly double height/width.

    The first link of the chain is `op` itself and later links are clones of
    it. Intermediate tensors are int16, and the last link writes to the
    original outputs. Each link gets a "rescale" attribute of 128 or 1/128
    when it converts between 8 and 16 bit data; otherwise any existing
    "rescale" attribute is removed.

    Returns `op`.
    """
    step = 0
    prev_op = op
    final_outputs = op.outputs

    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
    if op.attrs["align_corners"]:
        shape_modifier, padding = 1, b"VALID"
    else:
        shape_modifier, padding = 0, b"SAME"
    op.attrs["padding"] = padding
    op.inputs[0].resampling_mode = resampling_mode.NEAREST

    upscaled_shape = np.array(op.inputs[0].shape[1:3])
    out_shape = np.array(op.outputs[0].shape[1:3])
    # Nothing to do when an upscaling step would leave the shape unchanged
    if (upscaled_shape == upscaled_shape * 2 - shape_modifier).all():
        return op

    while (upscaled_shape < out_shape).all():
        if step == 0:
            scaled_op = prev_op
        else:
            scaled_op = op.clone("_{}".format(step))
            scaled_op.inputs[0] = prev_op.outputs[0]

        upscaled_shape = upscaled_shape * 2 - shape_modifier

        if (upscaled_shape == out_shape).all():
            # Final step: write straight into the original outputs
            scaled_op.outputs = final_outputs
            scaled_op.outputs[0].ops = [scaled_op]
        else:
            # Intermediate step: write to a new int16 tensor of the upscaled size
            intermediate_shape = final_outputs[0].shape.copy()
            intermediate_shape[1:3] = upscaled_shape[0:2]
            intermediate = Tensor(
                intermediate_shape, DataType.int16, "{}_{}".format(op.outputs[0].name, step)
            )
            intermediate.quantization = op.outputs[0].quantization.clone()
            int16_limits = np.iinfo(np.int16)
            intermediate.quantization.quant_min = int16_limits.min
            intermediate.quantization.quant_max = int16_limits.max
            scaled_op.set_output_tensor(intermediate)
            prev_op = scaled_op
            step += 1

        # Setup the scale value
        ifm_bits = scaled_op.inputs[0].dtype.bits
        ofm_bits = scaled_op.outputs[0].dtype.bits
        if ifm_bits == 8 and ofm_bits == 16:
            scaled_op.attrs["rescale"] = 128
        elif ifm_bits == 16 and ofm_bits == 8:
            scaled_op.attrs["rescale"] = 1 / 128
        elif "rescale" in scaled_op.attrs:
            del scaled_op.attrs["rescale"]

    return op

Charles Xu

2020-07-02 15:12:40 +0200

[diff] [blame]

296

def fixup_resizebilinear(op, arch):
    """Rewrite a ResizeBilinear op that runs on the NPU.

    - Same input and output shape: becomes an Identity with only its first input.
    - 1x1 input (height and width): converted to an elementwise add.
    - Anything else: converted to a chain of 2x2 pool ops.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if not (op.type == "ResizeBilinear" and op.run_on_npu):
        return op

    ifm_shape = op.inputs[0].shape
    if ifm_shape == op.outputs[0].shape:
        # Bypass nop resizebilinear
        op.inputs = op.inputs[:1]
        op.type = "Identity"
    elif ifm_shape[1] == 1 and ifm_shape[2] == 1:
        convert_resizebilinear_1x1_to_add(op)
    else:
        convert_resizebilinear_to_2x2_pool(op)

    return op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

310

def fixup_fully_connected_input(op, arch):
    """Make the input of a FullyConnectedAct op 2D: [batch_size, n_in_elems].

    n_in_elems is taken from the second-to-last dimension of the weights. If
    the input does not already have the desired shape, the op's first input
    is replaced with a reshaped tensor.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type != "FullyConnectedAct":
        return op

    inp, weights = op.inputs[0], op.inputs[1]

    n_in_elems = weights.shape[-2]
    elms = inp.elements()
    batch_size, leftover = divmod(elms, n_in_elems)
    # The input must divide evenly into rows of n_in_elems elements
    assert leftover == 0

    desired_shape = [batch_size, n_in_elems]
    if inp.shape != desired_shape:
        # mismatch, insert a reshape to fix this.
        op.inputs[0] = create_reshape_tensor(inp, desired_shape)

    return op

def fixup_pack_input(op, arch):
    """Insert a Reshape in front of every input of a Pack op and retype it as PackReshaped.

    Each reshaped input gets a size-1 dimension inserted at the pack axis.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type != "Pack":
        return op

    # Pack is also referred to as Stack
    # Requires the rewrite_concat function to be called on the op afterwards
    axis = int(op.attrs["axis"])
    first_shape = op.inputs[0].shape
    desired_shape = first_shape[:axis] + [1] + first_shape[axis:]

    # Construct 1 shape tensor to be used by all inserted reshape ops
    new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, desired_shape)

    for idx, inp in enumerate(op.inputs):
        reshaped = inp.clone("_reshaped")
        reshaped.set_all_shapes(desired_shape)

        reshape_op = Operation("Reshape", "{}{}_reshape".format(op.name, idx))
        reshape_op.attrs["new_shape"] = desired_shape
        reshape_op.inputs = [inp, new_shape_tens]
        reshape_op.set_output_tensor(reshaped)

        op.inputs[idx] = reshaped

    op.type = "PackReshaped"
    return op

def fixup_unpack_output(tens, arch):
    """Insert Reshape ops after the outputs of an Unpack or rank-changing StridedSlice.

    For Unpack, the op is retyped as UnpackReshaped and its outputs keep a
    size-1 dimension at the unpack axis. For StridedSlice, the shrink/new-axis
    mask is cleared on the op and the rank change is done by the inserted
    Reshape instead. StridedSlice ops that combine both masks, or that use an
    ellipsis mask, are left untouched, as are equal-rank slices.

    tens: tensor whose first producer is inspected; always returned.
    arch: unused by this function.
    """
    op = tens.ops[0]
    if op.type not in ("Unpack", "StridedSlice"):
        return tens

    # Unpack is also referred to as Unstack
    # Requires the rewrite_split function to be called on the op afterwards

    reshape_input_shape = tens.shape
    if op.type == "StridedSlice":
        new_axis_mask = op.attrs["new_axis_mask"]
        shrink_axis_mask = op.attrs["shrink_axis_mask"]
        ellipsis_mask = op.attrs["ellipsis_mask"]

        if (new_axis_mask != 0 and shrink_axis_mask != 0) or ellipsis_mask != 0:
            # Not supported, will be put on CPU
            return tens
        if shrink_axis_mask == 0 and new_axis_mask == 0:
            # Equal Rank StridedSlice, no need to insert reshape
            return tens

        # Exactly one of the two masks is non-zero from here on
        n = 0
        if shrink_axis_mask != 0:
            # Re-insert a size-1 dimension for every bit set in the mask, lowest bit first
            while shrink_axis_mask:
                remaining = shrink_axis_mask & (shrink_axis_mask - 1)
                n += 1
                axis = int(math.log2(shrink_axis_mask - remaining))
                shrink_axis_mask = remaining
                reshape_input_shape = reshape_input_shape[:axis] + [1] + reshape_input_shape[axis:]

            assert len(tens.shape) == (len(op.inputs[0].shape) - n)
            op.attrs["shrink_axis_mask"] = 0
        else:
            # Drop one dimension for every bit set in the mask, lowest bit first
            while new_axis_mask:
                remaining = new_axis_mask & (new_axis_mask - 1)
                n += 1
                axis = int(math.log2(new_axis_mask - remaining))
                reshape_input_shape = reshape_input_shape[:axis] + reshape_input_shape[(axis + 1) :]
                # Shift the remaining bits down by one after each removal
                new_axis_mask = remaining >> 1

            assert len(tens.shape) == (len(op.inputs[0].shape) + n)
            op.attrs["new_axis_mask"] = 0
    else:
        axis = int(op.attrs["axis"])
        op.type = "UnpackReshaped"
        reshape_input_shape = tens.shape[:axis] + [1] + tens.shape[axis:]

    # Construct 1 shape tensor to be used by all inserted reshape ops
    new_shape_tens = create_const_tensor(op.name + "_reshape_shape", [1], DataType.int32, tens.shape)

    for idx, out_tens in enumerate(op.outputs):
        pre_reshape = out_tens.clone("_reshaped")
        pre_reshape.set_all_shapes(reshape_input_shape)
        pre_reshape.ops = [op]

        reshape_op = Operation("Reshape", "{}{}_reshape".format(op.name, idx))
        reshape_op.attrs["new_shape"] = reshape_input_shape
        reshape_op.inputs = [pre_reshape, new_shape_tens]
        reshape_op.set_output_tensor(out_tens)

        op.outputs[idx] = pre_reshape

    return tens

def add_padding_fields(op, arch):
    """Add "explicit_padding" and "skirt" attributes to an NPU op that has a "padding" attribute.

    The kernel size comes from the weights shape for convolution and depthwise
    ops, and from the "ksize"/"ksizes" attribute for pooling, reduce-sum and
    ExtractImagePatches ops.

    op: operation to inspect; always returned.
    arch: unused by this function.
    Raises UnsupportedFeatureError for any other op type that has a "padding" attribute.
    """
    if not op.run_on_npu or "padding" not in op.attrs:
        return op

    if op.type in conv_op | depthwise_op:
        kernel_size = op.inputs[1].shape[:2]
    elif op.type in pool_op | reduce_sum_ops:
        kernel_size = op.attrs["ksize"][1:3]
    elif op.type == "ExtractImagePatches":
        kernel_size = op.attrs["ksizes"][1:3]
    else:
        raise UnsupportedFeatureError("Unknown operation that uses padding: {}".format(op.type))
    input_shape = op.inputs[0].shape

    if op.type == "Conv2DBackpropInputSwitchedBias":
        # The upscaling factor is derived from the output/input height ratio
        upscaling_factor = op.outputs[0].shape[1] // input_shape[1]
        padding, skirt = calc_upscaled_padding_and_skirt(
            op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape, upscaling_factor
        )
    else:
        dilation_h, dilation_w = op.get_dilation_h_w()
        # Effective kernel extent once dilation is taken into account
        dilated_kernel_size = [dilation_h * (kernel_size[0] - 1) + 1, dilation_w * (kernel_size[1] - 1) + 1]
        padding, skirt = calc_padding_and_skirt(
            op.attrs["padding"], dilated_kernel_size, op.attrs["strides"], input_shape
        )

    op.attrs["explicit_padding"] = padding
    op.attrs["skirt"] = skirt

    return op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

454

# Check if the op can be reordered
def get_prepend_op(op):
    """Return the earliest memory-only op in the chain feeding `op`, or None.

    Starting from the producer of the op's first input, the chain is followed
    upwards while each producer is a memory-only op whose single output has a
    single consumer. The last such op is returned, provided the op above it
    also has a single output with a single consumer.
    """

    def has_single_output_and_consumer(candidate):
        return len(candidate.outputs) == 1 and len(candidate.outputs[0].consumers()) == 1

    # The op should be reordered between prev_op and prep_op
    prev_op = op.inputs[0].ops[-1]
    prep_op = None
    while prev_op.type in memory_only_ops and has_single_output_and_consumer(prev_op):
        prep_op = prev_op
        prev_op = prev_op.inputs[0].ops[-1]

    if prev_op is not None and has_single_output_and_consumer(prev_op):
        return prep_op

    return None

def mark_npu_block_type(op, arch):
    """Store the NpuBlockType matching the op's type in op.attrs["npu_block_type"].

    Op types not found in any of the known groups get NpuBlockType.Default.

    op: operation to annotate; always returned.
    arch: unused by this function.
    """
    # Checked in order; the first group containing the op type wins
    block_type_by_group = (
        (conv_op, NpuBlockType.ConvolutionMxN),
        (fc_op, NpuBlockType.VectorProduct),
        (depthwise_op, NpuBlockType.ConvolutionDepthWise),
        (pool_op, NpuBlockType.Pooling),
        (elementwise_op, NpuBlockType.ElementWise),
        (reduce_sum_ops, NpuBlockType.ReduceSum),
    )

    npu_block_type = NpuBlockType.Default
    for op_types, block_type in block_type_by_group:
        if op.type in op_types:
            npu_block_type = block_type
            break

    op.attrs["npu_block_type"] = npu_block_type
    return op

def convert_depthwise_to_conv(op, arch):
    """Convert a depthwise op with depth_multiplier != 1 into a Conv2D op in place.

    Depthwise is equivalent to a single conv2d if the ifm depth is 1 and the
    ofm depth equals the depth multiplier. When that holds, the op type is
    switched and the last two weight axes are swapped.

    op: operation to inspect; always returned.
    arch: unused by this function.
    Raises UnsupportedFeatureError when depth_multiplier != 1 but the
    conditions above are not met.
    """
    if op.type not in depthwise_op or op.attrs["depth_multiplier"] == 1:
        return op

    ifm_tensor = op.inputs[0]
    weight_tensor = op.inputs[1]
    ofm_tensor = op.outputs[0]

    convertible = (ifm_tensor.shape[3] == 1) and (ofm_tensor.shape[3] == op.attrs["depth_multiplier"])
    if not convertible:
        raise UnsupportedFeatureError(
            "Unsupported DepthwiseConv2d with depth_multiplier = {}, ifm channels = {}, ofm channels = {}".format(
                op.attrs["depth_multiplier"], ifm_tensor.shape[3], ofm_tensor.shape[3]
            )
        )

    # Change op type to Conv2d
    op.type = op.type.replace("DepthwiseConv2d", "Conv2D")
    del op.attrs["channel_multiplier"]
    del op.attrs["depth_multiplier"]

    # Swap the last two weight axes and keep the tensor shapes in sync
    transposed = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
    weight_tensor.quant_values = transposed
    weight_tensor.set_all_shapes(list(transposed.shape))

    return op

Jacob Bohlin

2020-06-23 12:12:56 +0200

[diff] [blame]

516

def reorder_depthwise_weights(op, arch):
    """Swap the last two weight axes of a depthwise op and flag the weights as transposed.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type not in depthwise_op:
        return op

    weight_tensor = op.inputs[1]
    transposed = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
    weight_tensor.quant_values = transposed
    weight_tensor.set_all_shapes(list(transposed.shape))
    weight_tensor.weight_transpose_depthwise = True

    return op

Michael McGeagh

2020-07-29 13:11:43 +0100

[diff] [blame]

526

def convert_conv_to_fc(op, arch):
    """Rewrite a 1x1 Conv2DBiasAct on a 1x1 (HxW) input as a FullyConnectedAct op.

    Conv 1x1 can be equivalent to Fully Connected. By representing certain
    convs as fully connected layers, Vela can better determine whether or not
    to use caching/double buffering for the weights. (Weights don't need to be
    reloaded for convs when IFM H and W are 1.)

    The weights are squeezed to 2D, the op writes to a new 2D output tensor,
    and a Reshape op restores the original output tensor from it.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type != "Conv2DBiasAct":
        return op

    _, h, w, _ = op.inputs[0].shape
    kh, kw, _, _ = op.inputs[1].shape
    if any(dim != 1 for dim in (h, w, kh, kw)):
        return op

    # Overwrite this op as a Fully Connected Op
    op.name += "_fc"
    op.type = "FullyConnectedAct"
    # Only the fused activation survives; all other attributes are replaced
    faf = op.attrs.get("fused_activation_function", None)
    op.attrs = {
        "fused_activation_function": faf,
        "weights_format": 0,
        "npu_block_type": NpuBlockType.VectorProduct,
    }

    # Reshape Weights to be 2D. HWIO becomes just IO (as H and W are 1, they can just be dropped)
    weight_tensor = op.inputs[1]
    squeezed = weight_tensor.quant_values.squeeze(axis=(0, 1))
    weight_tensor.quant_values = squeezed
    weight_tensor.set_all_shapes(list(squeezed.shape))

    # The output from a fully connected is expected to be 2D so we need to add a reshape layer to convert it
    # back to 4D afterwards as the next layer is expecting that shape
    orig_ofm_tensor = op.outputs[0]
    # Reshape this ops output to be 2D: {(N*H*W), C} (We know N H and W are all 1 so this becomes {1, C})
    fc_ofm_tensor = orig_ofm_tensor.clone("_fc")
    fc_ofm_tensor.set_all_shapes([1, fc_ofm_tensor.shape[-1]])
    fc_ofm_tensor.ops = [op]

    # Add a reshape after the new OFM to convert it back to the original 4D shape
    reshape_name = op.name + "_reshape"
    new_shape_tens = create_const_tensor(reshape_name + "_shape", [1], DataType.int32, orig_ofm_tensor.shape)
    reshape_op = Operation("Reshape", reshape_name)
    reshape_op.attrs["new_shape"] = orig_ofm_tensor.shape
    reshape_op.inputs = [fc_ofm_tensor, new_shape_tens]
    reshape_op.set_output_tensor(orig_ofm_tensor)

    # Replace this ops OFM to point to the 2D tensor
    op.outputs[0] = fc_ofm_tensor

    return op

Michael McGeagh

2020-09-08 11:09:48 +0100

[diff] [blame^]

567

def fixup_relus_with_differing_ifm_ofm_scaling(op, arch):
    """Replace an NPU relu whose IFM and OFM scaling differ with an avgpool nop that fuses it.

    Relu with differing IFM and OFM scaling cannot be fused with another
    primary op and requires its own to be inserted.

    op: operation to inspect.
    arch: unused by this function.
    Returns the new avgpool op when a replacement is made, otherwise `op`.
    """
    if not op.run_on_npu or op.type not in relu_ops:
        return op

    ifm = op.inputs[0]
    ofm = op.outputs[0]
    if ifm.is_scaling_equal(ofm):
        return op

    # Override this op with its own primary op (avgpool)
    avgpool_op = create_avgpool_nop(op.name + "_avgpool")
    # And fuse the original activation function to it
    avgpool_op.attrs["fused_activation_function"] = op.type
    # Tidy up and assign the ifm and ofm to the new op
    ifm.consumer_list.remove(op)
    avgpool_op.add_input_tensor(ifm)
    avgpool_op.set_output_tensor(ofm)
    return avgpool_op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

586

# Reorder activation op if it's after the memory only operations
def fixup_act_reorder(op, arch):
    """Move an activation op in front of the memory-only ops that precede it.

    A clone of the activation is inserted before the op returned by
    get_prepend_op(), and the original op is retyped as Identity.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type not in activation_ops:
        return op

    prep_op = get_prepend_op(op)
    if prep_op is None:
        return op

    # The cloned activation consumes what the memory-only chain used to consume
    chain_input = prep_op.inputs[0]
    act_op = op.clone("_reordered")
    act_op.inputs = [chain_input]
    act_op_out = chain_input.clone("_acted")
    act_op_out.quantization = op.outputs[0].quantization.clone()
    act_op.set_output_tensor(act_op_out)

    # The memory-only chain now reads the activated tensor and takes over its quantization
    prep_op.inputs[0] = act_op_out
    prep_op.outputs[0].quantization = act_op_out.quantization.clone()

    # Mark the op so that it will be removed as passthrough later on
    op.type = "Identity"

    return op

Louis Verhaard

2020-06-03 08:56:44 +0200

[diff] [blame]

603

Charles Xu

2020-05-13 10:15:26 +0200

[diff] [blame]

604

def fixup_elementwise_with_scalars(op, arch):
    """Fix up the input shapes of a binary elementwise op.

    - Both inputs non-scalar: the lower-rank one is expanded with full_shape()
      to the rank of the other.
    - One input has shape [] and no quant_values: it is given a shape of all
      ones with the rank of the other input.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type not in binary_elementwise_op:
        return op

    ifm_tensor, ifm2_tensor, _, _ = op.get_ifm_ifm2_weights_ofm()

    if ifm2_tensor.shape != [] and ifm_tensor.shape != []:
        ifm_rank = len(ifm_tensor.shape)
        ifm2_rank = len(ifm2_tensor.shape)
        if ifm_rank > ifm2_rank:
            ifm2_tensor.shape = full_shape(ifm_rank, ifm2_tensor.shape, 1)
        elif ifm_rank < ifm2_rank:
            ifm_tensor.shape = full_shape(ifm2_rank, ifm_tensor.shape, 1)
    elif ifm_tensor.shape == [] and ifm_tensor.quant_values is None:
        # IFM is marked as a scalar, but is a result of an operation; change it to a shape of size 1
        ifm_tensor.shape = len(ifm2_tensor.shape) * [1]
        ifm_tensor.storage_shape = ifm_tensor.shape
    elif ifm2_tensor.shape == [] and ifm2_tensor.quant_values is None:
        # IFM2 is marked as a scalar, but is a result of an operation; change it to a shape of size 1
        ifm2_tensor.shape = len(ifm_tensor.shape) * [1]
        ifm2_tensor.storage_shape = ifm2_tensor.shape

    return op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

622

Louis Verhaard

2020-06-03 08:56:44 +0200

[diff] [blame]

623

Tim Hall

4e12776

2020-05-15 16:05:49 +0100

[diff] [blame]

624

# Set input/output tensor equivalence to the same id for memory operations
def set_tensor_equivalence(op, arch):
    """Copy the equivalence_id of a memory-only op's first output to all of its inputs.

    op: operation to inspect; always returned.
    arch: unused by this function.
    """
    if op.type not in memory_only_ops:
        return op

    shared_id = op.outputs[0].equivalence_id
    for input_tens in op.inputs:
        input_tens.equivalence_id = shared_id

    return op

Fredrik Svedberg

2020-06-03 15:43:31 +0200

[diff] [blame]

633

def convert_softmax(op, arch):
    """Replace a Softmax op that runs on the NPU with the graph built by SoftMax(op).

    Any other op is returned unchanged. `arch` is not used.
    """
    if op.type != "Softmax" or not op.run_on_npu:
        return op
    return SoftMax(op).get_graph()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

640

def convert_mul_max_to_abs_or_lrelu(op, arch):
    r"""Whenever there is a subgraph with this topology:

    Input    X   For X = -1 or X > 0
    |   \   /    This subgraph can be replaced with either
    |    Mul     an Abs (if X = -1) or a LeakyReLU (if X > 0)
    |   /
    Max

    (the code accepts X >= 0 for the LeakyRelu case).

    The rewrite is only performed when:
      * the Mul output has exactly one consumer and the Mul has no fused activation function
      * the IFM/OFM returned by op.get_ifm_weights_biases_ofm() are both uint8 or both int8
      * IFM, OFM and the Mul output all have equal scaling
      * the Max and the Mul share exactly one input, and the other Mul input is a scalar
        (shape []) produced by a "Const" op

    On success the Maximum op is retyped and renamed in place (its first output tensor is
    renamed too), its only input becomes the shared input, and the Mul is removed from the
    shared input's consumer list. For LeakyRelu the attrs "alpha" and "alpha_scaling" are
    stored on the op. In every other case neither the op nor the surrounding graph is
    modified.

    `arch` is not used. Returns op.
    """
    if op.type != "Maximum":
        return op

    # finds the Mul input(s) to the Max
    muls = [i for i in op.inputs if i.ops[0].type == "MulAct"]
    if len(muls) == 1:
        mul = muls[0].ops[0]
    elif len(muls) == 2:
        # Both inputs are Muls: the one of interest is the Mul that itself consumes an
        # input of the Max. Bail out (instead of raising IndexError) if there is none.
        max_inputs = set(op.inputs)
        candidates = [m.ops[0] for m in muls if max_inputs & set(m.ops[0].inputs)]
        if not candidates:
            return op
        mul = candidates[0]
    else:
        # No Mul inputs
        return op

    # make sure the Mul doesn't have any other consumers
    mul_ofm = mul.outputs[0]
    if len(mul_ofm.consumers()) != 1:
        return op
    # make sure the Mul doesn't have a faf
    if mul.attrs["fused_activation_function"]:
        return op
    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
    if ifm.dtype not in (DataType.uint8, DataType.int8) or ifm.dtype != ofm.dtype:
        return op
    if not ifm.is_scaling_equal(ofm) or not ifm.is_scaling_equal(mul_ofm):
        # rewrite to LeakyRelu currently only makes sense if the quantization is identical
        return op

    # finds the branched input that goes to both the Max and the Mul
    shared = set(op.inputs) & set(mul.inputs)
    if len(shared) != 1:
        return op
    shared_in = shared.pop()

    # find the constant scalar input to the Mul
    other_mul_inputs = set(mul.inputs) - {shared_in}
    if not other_mul_inputs:
        # The Mul has no second, distinct input (e.g. Mul(X, X)); nothing to rewrite
        return op
    const_tens = other_mul_inputs.pop()
    # check that it is a scalar
    if const_tens.shape != []:
        return op
    const = const_tens.ops[0]
    # check that it is a constant
    if const.type != "Const":
        return op

    val = const.outputs[0].values
    if val >= 0:
        new_op = "LeakyRelu"
        op.attrs["alpha"] = val
        # to produce bit exact results, the alpha is not enough;
        # save additional scaling info in attr "alpha_scaling", to be used as input
        # to the LUT construction
        alpha_scalar = const_tens.quant_values - const_tens.quantization.zero_point
        mul_ifm_scale = np.double(ifm.quantization.scale_f32)
        mul_ifm2_scale = np.double(const_tens.quantization.scale_f32)
        mul_ofm_scale = np.double(mul_ofm.quantization.scale_f32)
        alpha_scale, alpha_shift = scaling.elementwise_mul_scale(mul_ifm_scale, mul_ifm2_scale, mul_ofm_scale)
        op.attrs["alpha_scaling"] = (alpha_scalar, alpha_scale, alpha_shift)
    elif val == -1:
        new_op = "Abs"
    else:
        return op

    # All checks passed: only now detach the Mul from the shared input's consumers, so that
    # an abandoned rewrite never leaves the graph in an inconsistent state
    shared_in.consumer_list.remove(mul)

    op.type = op.type.replace("Maximum", new_op)
    op.name = op.name.replace("Maximum", new_op)
    op.outputs[0].name = op.outputs[0].name.replace("Maximum", new_op)
    op.inputs = [shared_in]
    return op

Louis Verhaard

2020-08-13 11:47:36 +0200

[diff] [blame]

719

def convert_lrelu_to_mul_max(op, arch):
    """Rewrite a LeakyRelu op in place as Max(alpha * IFM, identity * IFM).

    This is the opposite of convert_mul_max_to_abs_or_lrelu. A "MulAct" op multiplying the
    IFM with a scalar constant (quantization scale = alpha) is always created. A second
    "MulAct" with an identity scalar (quantization scale = 1) is only created when IFM and
    OFM scaling differ; otherwise the IFM itself is fed to the Max.

    `arch` is not used. Returns op, now of type "Maximum".
    """
    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()

    # --- alpha branch: IFM * alpha -------------------------------------------------------
    alpha_mul = Operation("MulAct", op.name + "_mul_alpha")
    alpha_mul.add_input_tensor(ifm)
    # Scalar const tensor whose quantization carries alpha as its scale
    alpha = op.attrs["alpha"]
    alpha_quant = ifm.quantization.clone()
    alpha_quant.min = 0
    alpha_quant.max = alpha * (alpha_quant.quant_max - alpha_quant.quant_min)
    alpha_quant.scale_f32 = alpha
    alpha_quant.zero_point = 0
    alpha_scalar = create_const_tensor(
        op.name + "_alpha_scalar", [], ifm.dtype, [1], np.int8, quantization=alpha_quant
    )
    alpha_mul.add_input_tensor(alpha_scalar)
    fm_alpha = ofm.clone(op.name + "_alpha")
    alpha_mul.set_output_tensor(fm_alpha)

    # --- identity branch: IFM rescaled to the OFM scaling, only if they differ -------------
    if ifm.is_scaling_equal(ofm):
        # No identity multiplication is needed
        fm_id = ifm
    else:
        identity_mul = Operation("MulAct", op.name + "_mul_identity")
        identity_mul.add_input_tensor(ifm)
        # Scalar const tensor whose quantization has scale 1
        id_quant = ifm.quantization.clone()
        id_quant.min = 0
        id_quant.max = id_quant.quant_max - id_quant.quant_min
        id_quant.scale_f32 = 1
        id_quant.zero_point = 0
        id_scalar = create_const_tensor(
            op.name + "_id_scalar", [], ifm.dtype, [1], np.uint8, quantization=id_quant
        )
        identity_mul.add_input_tensor(id_scalar)
        fm_id = ofm.clone(op.name + "_id")
        identity_mul.set_output_tensor(fm_id)

    # --- turn the LeakyRelu into a Max over the two branches -----------------------------
    op.type = "Maximum"
    op.name = op.name.replace("LeakyRelu", "Maximum")
    op.inputs = []
    ifm.consumer_list.remove(op)
    for branch_output in (fm_alpha, fm_id):
        op.add_input_tensor(branch_output)
    return op

def convert_lrelu_to_lut(op, arch):
    """Rewrite a LeakyRelu op in place as an Add with scalar 0 followed by a LUT activation.

    The IFM must have a 1-byte data type (asserted). The op becomes an elementwise "AddAct"
    flagged with attr "is_nop", gets a scalar-zero const tensor as extra input, and a
    256-entry lookup table is attached through op.set_activation_lut().

    `arch` is not used. Returns op.
    """
    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
    assert ifm.dtype.size_in_bytes() == 1
    op.type = "AddAct"
    op.name = op.name + "_add"
    op.attrs["npu_block_type"] = NpuBlockType.ElementWise
    # Mark as no-op to enable potential fusing optimizations
    op.attrs["is_nop"] = True

    # Create an input tensor containing scalar zero
    zero_quant = QuantizationParameters(0.0, 255.0)
    zero_quant.scale_f32 = ifm.quantization.scale_f32
    zero_quant.zero_point = 0
    zero_tens = create_const_tensor(
        op.inputs[0].name + "_add", [], ifm.dtype, [0], np.uint8, quantization=zero_quant
    )
    op.add_input_tensor(zero_tens)

    # Gather the scaling information needed to generate the LUT
    alpha = op.attrs["alpha"]
    ifm_scale = np.double(ifm.quantization.scale_f32)
    ofm_scale = np.double(ofm.quantization.scale_f32)
    zp_in = ifm.quantization.zero_point
    zp_out = ofm.quantization.zero_point
    identity_scale, identity_shift = scaling.elementwise_mul_scale(ifm_scale, 1, ofm_scale)
    alpha_scalar = 1
    alpha_scale, alpha_shift = scaling.elementwise_mul_scale(ifm_scale, alpha, ofm_scale)
    if "alpha_scaling" in op.attrs:
        # The LeakyRelu was the result from convert_mul_max_to_abs_or_lrelu
        alpha_scalar, alpha_scale, alpha_shift = op.attrs["alpha_scaling"]

    ix = range(256) if ifm.dtype == DataType.uint8 else range(-128, 128)
    quantized_min = ix[0]
    quantized_max = ix[-1]

    def lut_entry(x):
        # Inputs below the input zero point take the alpha slope, all others the identity
        if x < zp_in:
            scaled = fp_math.multiply_by_quantized_multiplier(
                alpha_scalar * (x - zp_in), alpha_scale, alpha_shift
            )
        else:
            scaled = fp_math.multiply_by_quantized_multiplier(x - zp_in, identity_scale, identity_shift)
        # Clamp to the quantized range of the IFM data type
        return min(quantized_max, max(quantized_min, zp_out + scaled))

    values = [lut_entry(x) for x in ix]

    # The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale),
    # so even if the OFM has a different scale than the IFM, the generated OFM scale instructions
    # should be the same as the IFM
    op.attrs["forced_output_quantization"] = ifm.quantization
    op.set_activation_lut(lut.create_lut_tensor(op.name + "_lut", values, DataType.int8))
    return op

def convert_lrelu(op, arch):
    """Convert a LeakyRelu op to a LUT based solution if possible, otherwise to mul + max.

    * IFM is int8/uint8 with the same OFM data type: convert_lrelu_to_lut
    * IFM and OFM are both int16 with equal scaling: op is kept as it is
    * anything else: convert_lrelu_to_mul_max

    Ops of any other type are returned unchanged.
    """
    if op.type != "LeakyRelu":
        return op

    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
    same_dtype = ifm.dtype == ofm.dtype

    if ifm.dtype in (DataType.uint8, DataType.int8) and same_dtype:
        return convert_lrelu_to_lut(op, arch)

    if ifm.is_scaling_equal(ofm) and same_dtype and ifm.dtype == DataType.int16:
        # LeakyRelu is used unmodified for int16 with equal input/output scaling
        return op

    return convert_lrelu_to_mul_max(op, arch)

830

831

Patrik Gustavsson

fa4cb29

2020-09-10 08:19:36 +0200

[diff] [blame]

832

def remove_unwanted_reshapes(op, arch):
    """Try to bypass the Reshapes enclosing an ElementWise op with one non-constant input.

    The op is rewired only when all of the following hold:
      * op runs on the NPU and its "npu_block_type" attr is NpuBlockType.ElementWise
      * exactly one input is not produced by a "Const" op
      * that input has a single consumer and is produced by a "Reshape"
      * the op's first output has a single consumer, which is a "Reshape"
      * the input of the leading Reshape and the output of the trailing Reshape have the
        same number of dimensions
      * scaling is equal across each of the two Reshapes

    In that case the non-constant input is replaced by the leading Reshape's input and the
    op's first output by the trailing Reshape's output. The replacement is done at the
    position where the non-constant input was found, so a constant in position 0 is never
    overwritten.

    `arch` is not used. Returns op.
    """
    if not op.run_on_npu or op.attrs["npu_block_type"] != NpuBlockType.ElementWise:
        return op

    # Check if the ElementWise operator only has one non-constant input; remember its index
    non_const = [(idx, tens) for idx, tens in enumerate(op.inputs) if tens.ops[0].type != "Const"]
    if len(non_const) != 1:
        return op
    ifm_idx, ifm = non_const[0]

    # Check if operation is enclosed by Reshapes that can be removed
    ofm = op.outputs[0]
    prev_op = ifm.ops[0]
    enclosed_by_reshapes = (
        len(ifm.consumer_list) == 1
        and prev_op.type == "Reshape"
        and len(ofm.consumer_list) == 1
        and ofm.consumer_list[0].type == "Reshape"
    )
    if not enclosed_by_reshapes:
        return op

    prev_op_ifm, _, _, prev_op_ofm = prev_op.get_ifm_weights_biases_ofm()
    cons_op = ofm.consumer_list[0]
    cons_op_ifm = ofm
    cons_op_ofm = cons_op.outputs[0]
    if len(prev_op_ifm.shape) != len(cons_op_ofm.shape):
        return op

    # Check if quantization is the same in the input and output for the reshape ops
    if prev_op_ifm.quantization.is_scaling_equal(
        prev_op_ofm.quantization
    ) and cons_op_ifm.quantization.is_scaling_equal(cons_op_ofm.quantization):
        op.inputs[ifm_idx] = prev_op_ifm
        op.outputs[0] = cons_op_ofm
    return op

Louis Verhaard

2020-08-13 11:47:36 +0200

[diff] [blame]

867

def fuse_activation_function_with_prev(op, arch):
    """If op is a no-op with a fused activation, try to move the activation to the preceding op.

    Nothing is done unless op has attr "is_nop" set and a "fused_activation_function".
    When fusing is possible, the attrs "fused_activation_function" and
    "forced_output_quantization" (where present) and the activation LUT (if any) are copied
    to the op producing the IFM, and that op's output is set to this op's OFM, which
    bypasses op.

    Returns op in all cases.
    """
    attrs = op.attrs
    if not attrs.get("is_nop", False) or attrs.get("fused_activation_function", None) is None:
        return op

    ifm, _, _, ofm = op.get_ifm_weights_biases_ofm()
    # the operation producing the input
    producer = ifm.ops[0]
    # Note: the below checks on producer require that a first optimize pass on the full graph has been performed
    can_fuse = (
        producer.run_on_npu
        and producer.attrs["npu_block_type"] != NpuBlockType.Default
        and len(ifm.ops) == 1
        and len(producer.outputs[0].consumers()) == 1
        and producer.attrs.get("fused_activation_function", None) is None
    )
    if op.activation_lut is not None and arch.shram_reserved_unused_banks == 0:
        # TODO: if SHRAM LUT space is shared with SHRAM ACC (32, 64 MAC),
        # LUT currently only works correctly for elementwise ops
        can_fuse = False
    if not can_fuse:
        return op

    # Move the fused activation function + corresponding info to the producer
    for key in ("fused_activation_function", "forced_output_quantization"):
        if key in attrs:
            producer.attrs[key] = attrs[key]
    if op.activation_lut is not None:
        producer.set_activation_lut(op.activation_lut)
    # Bypass op
    producer.set_output_tensor(ofm)
    return op

Dwight Lidman

2020-05-29 09:37:03 +0200

[diff] [blame]

899

def add_attrs_to_resizebilinear(op, arch):
    """Add padding/strides/ksize attrs to a ResizeBilinear op that performs a x2 upscale.

    Only ResizeBilinear ops that run on the NPU are considered. With H, W taken from
    dimensions 1 and 2 of the first input:
      * align_corners unset and output H, W == [2H, 2W]: padding is b"SAME"
      * align_corners set and output H, W == [2H - 1, 2W - 1]: padding is b"VALID"
      * otherwise the op is returned unchanged
    For the two handled cases the input tensor's resampling mode is set to NEAREST and
    "strides"/"ksize" are set to (1, 1, 1, 1)/(1, 2, 2, 1).

    `arch` is not used. Returns op.
    """
    if op.type != "ResizeBilinear" or not op.run_on_npu:
        return op

    ifm = op.inputs[0]
    doubled_hw = [ifm.shape[1] * 2, ifm.shape[2] * 2]
    out_hw = op.outputs[0].shape[1:3]
    align_corners = op.attrs["align_corners"]

    if not align_corners and out_hw == doubled_hw:
        # the output is supposed to be a x2 upscale, so SAME padding is needed
        padding = b"SAME"
    elif align_corners and out_hw == [doubled_hw[0] - 1, doubled_hw[1] - 1]:
        # the avg pool can run without padding and
        # produce a (M * 2 - 1, N * 2 - 1) sized output
        padding = b"VALID"
    else:
        return op

    op.attrs["padding"] = padding
    ifm.resampling_mode = resampling_mode.NEAREST
    op.attrs.update({"strides": (1, 1, 1, 1), "ksize": (1, 2, 2, 1)})
    return op

Jacob Bohlin

2020-08-26 18:21:28 +0200

[diff] [blame]

919

def fixup_bias_tensors(op, arch):
    """Give an op that needs a bias, but whose last input is empty, an all-zero bias tensor.

    The number of bias values is the last dimension of the op's second input. The new
    int32 const tensor has its quant_values set to its values and is installed as the last
    input of the op.

    `arch` is not used. Returns op.
    """
    if not op.needs_bias() or op.inputs[-1]:
        return op

    # Op has no bias, add bias tensor filled with zeros
    nr_biases = op.inputs[1].shape[-1]
    zero_bias = create_const_tensor(op.name + "_bias", [nr_biases], DataType.int32, [0] * nr_biases)
    zero_bias.quant_values = zero_bias.values
    op.set_input_tensor(zero_bias, -1)
    return op

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

931

def supported_operator_check(op, arch):
    """Record on op.run_on_npu whether arch.supported_operators supports the op; return op."""
    is_supported = arch.supported_operators.is_operator_supported(op)
    op.run_on_npu = is_supported
    return op

def optimise_graph_a(nng, arch, verbose_graph=False):
    """Run the first part of the early graph optimisations on every subgraph of nng.

    Two pre-order rewrite passes are done, each over all subgraphs before the next starts:
      1. the operation rewrites listed below, with fixup_unpack_output as tensor rewrite
         and rewrite_unsupported=False
      2. removal of passthrough tensors plus activation fusing and padding fields

    The graph is printed before and after when verbose_graph is set. Returns nng, whose
    subgraphs list has been updated in place.
    """
    if verbose_graph:
        nng.print_graph()

    # The rewrites are handed to rewrite_graph in exactly this order
    first_pass_op_rewrites = [
        # mark block type and check if the operations are supported
        mark_npu_block_type,
        set_tensor_equivalence,
        supported_operator_check,
        # then do any rewrites of supported operators
        convert_depthwise_to_conv,
        convert_conv_to_fc,
        convert_softmax,
        fixup_fully_connected_input,
        fixup_pack_input,
        fixup_conv2d_backprop,
        fixup_relus_with_differing_ifm_ofm_scaling,
        fixup_act_reorder,
        mark_npu_block_type,
        fixup_elementwise_with_scalars,
        reorder_depthwise_weights,
        fixup_resizebilinear,
        fixup_bias_tensors,
        convert_mul_max_to_abs_or_lrelu,
        remove_unwanted_reshapes,
        convert_lrelu,
    ]

    # First pass: rewrite graph pass
    for sg_idx, subgraph in enumerate(nng.subgraphs):
        nng.subgraphs[sg_idx] = rewrite_graph.rewrite_graph_pre_order(
            subgraph, arch, [fixup_unpack_output], first_pass_op_rewrites, rewrite_unsupported=False
        )

    # Second pass: remove passthrough tensors and attempt further optimizations
    for sg_idx, subgraph in enumerate(nng.subgraphs):
        nng.subgraphs[sg_idx] = rewrite_graph.rewrite_graph_pre_order(
            subgraph, arch, [remove_passthrough_tensor], [fuse_activation_function_with_prev, add_padding_fields]
        )

    if verbose_graph:
        nng.print_graph()
    return nng

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

980

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

981

def optimise_graph_b(nng, arch, verbose_graph=False):

if verbose_graph:

nng.print_graph()

for idx, sg in enumerate(nng.subgraphs):

986

# combined rewrite graph pass

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

987

nng.subgraphs[idx] = rewrite_graph.rewrite_graph_pre_order(sg, arch, [rewrite_concat, rewrite_split], [])

Tim Hall