Blame - ethosu/vela/graph_optimiser_util.py - ml/ethos-u/ethos-u-vela

2021-06-28 07:41:58 +0200

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

16

# Description:

17

# Common functions and definitions used during the graph optimization.

Patrik Gustavsson

c74682c

2021-08-17 14:26:38 +0200

[diff] [blame]

18

from typing import Tuple

19

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

20

import numpy as np

21

Patrik Gustavsson

f436ada

2021-09-14 14:56:48 +0200

[diff] [blame]

22

from . import lut

Tim Hall

d6efcd3

2022-09-02 15:01:01 +0100

[diff] [blame]

23

from .architecture_features import Accelerator

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

24

from .data_type import DataType

25

from .debug_database import DebugDatabase

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

26

from .errors import UnsupportedFeatureError

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

27

from .errors import VelaError

28

from .operation import Op

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

29

from .operation_util import create_avgpool_nop

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

30

from .shape4d import Shape4D

Patrik Gustavsson

f436ada

2021-09-14 14:56:48 +0200

[diff] [blame]

31

from .tensor import create_const_tensor

32

from .tensor import QuantizationParameters

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

33

Jonas Ohlsson

81942e9

2021-08-20 09:33:28 +0200

[diff] [blame]

34

# Operator types handled as "memory only" ops by the functions in this file
# (see set_tensor_equivalence, bypass_memory_only_ops and check_memory_only_removed).
memory_only_ops = (
    Op.Reshape,
    Op.QuantizedReshape,
    Op.Squeeze,
    Op.ExpandDims,
    Op.Identity,
)

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

41

Johan Alfvén

2022-09-28 20:06:25 +0200

[diff] [blame^]

42

# Ops that depend on the original ifm tensor shape being left unchanged
# by the bypass memory op function
original_ifm_shape_ops = (Op.Mean,)

45

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

46

47

def _avoid_nhcwb16_for_concat(tens):
    """Return True if any producer of tens writes at a depth offset that is not a multiple of 16.

    If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's are a
    multiple of 16. It is only then that the address offset for the ofm, for all operations, will be 16 byte
    aligned. For other values of axis the address offsets will be 16 byte aligned, as they are all based on c = 0
    and those addresses are always 16 byte aligned due to the NHCWB16 format.

    Producers without a write offset (write_offset is None) are ignored.
    """
    for producer in tens.ops:
        offset = producer.write_offset
        if offset is not None and offset.depth % 16 != 0:
            return True
    return False

53

54

55

def _avoid_nhcwb16_for_split(tens):
    """Return True if NHCWB16 needs to be avoided for tens because of a consumer read offset.

    NHCWB16 must be avoided in the input when a consumer reads tens (as ifm or ifm2) with a read offset
    whose depth is not a multiple of 16. A read offset of None never prevents NHCWB16.
    """

    def misaligned(offset):
        # True only for an existing offset whose C-dimension is not 16 aligned
        return offset is not None and (offset.depth % 16) != 0

    for consumer in tens.consumer_list:
        # read_offsets[0] belongs to ifm, read_offsets[1] to ifm2
        if consumer.ifm == tens and misaligned(consumer.read_offsets[0]):
            return True
        second_input = consumer.ifm2
        if second_input is not None and second_input == tens and misaligned(consumer.read_offsets[1]):
            return True
    return False

def _avoid_nhcwb16_for_shapes(tens):
    """Return True if any producer/consumer op shape differs from the shape of tens.

    For every consumer the op shape used for tens (ifm_shapes[0] when tens is the ifm, ifm_shapes[1] when
    tens is the ifm2 of a binary elementwise op) is compared with Shape4D(tens.shape). For every producer
    ofm_shapes[0] is compared in the same way. Any mismatch prevents NHCWB16.

    Raises:
        AssertionError: if a consumer uses tens neither as ifm nor as ifm2 of a binary elementwise op.
    """
    # check all producers/consumers to see if any op shape is preventing NHCWB16
    for cons_op in tens.consumer_list:
        if cons_op.ifm == tens:
            cons_op_shape = cons_op.ifm_shapes[0]
        elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == tens:
            cons_op_shape = cons_op.ifm_shapes[1]
        else:
            # Raised explicitly instead of "assert False": an assert statement is stripped under
            # "python -O", which would let a stale (or unbound) cons_op_shape be used below.
            raise AssertionError("Consumer op does not use the tensor as ifm or ifm2")
        if Shape4D(tens.shape) != cons_op_shape:
            return True

    for prod_op in tens.ops:
        if Shape4D(tens.shape) != prod_op.ofm_shapes[0]:
            return True

    return False

# Check if non linear format can be used

92

def check_format_restrictions(tens, arch):
    """Clear tens.needs_linear_format when no producer or consumer restricts the tensor format.

    Each check below returns early, leaving tens untouched, as soon as it finds a restriction.
    Only when every check passes is tens.needs_linear_format set to False. Always returns None.
    """
    if len(tens.ops) < 1:
        return

    # Tensors produced by placeholder/subgraph input/const ops, or having a None consumer, are left as is
    if tens.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const):
        return
    if any(consumer is None for consumer in tens.consumer_list):
        return

    # Check if any of the producers/consumers is run on CPU
    if any(not consumer.run_on_npu for consumer in tens.consumer_list):
        return
    if any(not producer.run_on_npu for producer in tens.ops):
        return

    # "Concat" ofm exception, "Split" ifm exception and shapes checking (all producers/consumers
    # must be NHCWB16 compatible with tens.shape), evaluated in that order
    if _avoid_nhcwb16_for_concat(tens) or _avoid_nhcwb16_for_split(tens) or _avoid_nhcwb16_for_shapes(tens):
        return

    # Resize bilinear half pixel center implementation requires OFM with linear format to
    # allow stride modification in H/W dimensions.
    for producer in tens.ops:
        if producer.original_type == Op.ResizeBilinear and producer.type == Op.DepthwiseConv2DBias:
            return

    def incompatible_consumers(oper):
        # Walks through chains of Reshape ops; yields True for every visited op that is
        # None or does not run on the NPU
        if oper and oper.type == Op.Reshape:
            for consumer in oper.outputs[0].consumer_list:
                yield from incompatible_consumers(consumer)
        yield not oper or not oper.run_on_npu

    def get_rewrites(oper):
        # Yields every Reshape op reachable through a chain of Reshape ops starting at oper
        if oper and oper.type == Op.Reshape:
            for consumer in oper.outputs[0].consumer_list:
                yield from get_rewrites(consumer)
            yield oper

    for consumer in tens.consumer_list:
        if consumer.type == Op.ReduceSum and (
            tens.dtype == DataType.int32 or arch.accelerator_config == Accelerator.Ethos_U65_512
        ):
            # ReduceSum requires NHWC input
            return

        if consumer.type != Op.Reshape:
            continue

        # Using NHCWB16 format for a no-op reshape is only an option if subsequent
        # consumers do not also need to perform a reshape or if the OFM is going to
        # be processed by CPU operations. No-op reshape consumers with empty lists
        # (those that have no consumers, or null-consumers used as list terminators)
        # must use normal NHWC output.
        if any(incompatible_consumers(consumer)):
            return

        # Detect no-op reshapes by comparing their full input and output tensor shapes.
        inshape = consumer.ifm_shapes[0]
        shape_matches = [(inshape == oper.ofm_shapes[0]) for oper in get_rewrites(consumer)]
        if not shape_matches or not all(shape_matches):
            return

    tens.needs_linear_format = False

160

161

Patrik Gustavsson

c74682c

2021-08-17 14:26:38 +0200

[diff] [blame]

162

def calc_explicit_padding(input_size, stride, filter_size, pad_before, pad_after) -> Tuple[int, int]:
    """
    Based on explicit padding provided in a PAD operation, returns the corresponding hardware padding
    that provides equivalent results.

    The "before" padding is returned unchanged. The "after" padding is decreased one step at a time
    until it is congruent, modulo stride, with the needed total padding minus pad_before, or until
    it reaches zero.
    """
    remaining_padding = needed_total_padding(input_size, stride, filter_size) - pad_before

    # The bottom/right padding might need downward adjustment depending on stride/input size
    adjusted_pad_after = pad_after
    while adjusted_pad_after > 0:
        if adjusted_pad_after % stride == remaining_padding % stride:
            break
        adjusted_pad_after -= 1
    return pad_before, adjusted_pad_after

175

176

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

177

def needed_total_padding(input_size, stride, filter_size):
    """Return the total padding (never negative) needed along one dimension.

    The output size is input_size divided by stride, rounded up. The padding is what the last filter
    position needs beyond input_size: (out_size - 1) * stride + filter_size - input_size, clamped at 0.
    """
    last_window_start = ((input_size + stride - 1) // stride - 1) * stride
    return max(0, last_window_start + filter_size - input_size)

# Set input/output tensor equivalence to the same id for memory operations

185

def set_tensor_equivalence(op, arch, nng):
    """Give every input of a memory only op the equivalence id of the op's first output.

    Ops whose type is not in memory_only_ops are left untouched. arch and nng are unused.
    Returns op in all cases.
    """
    if op.type not in memory_only_ops:
        return op

    shared_id = op.outputs[0].equivalence_id
    for input_tens in op.inputs:
        input_tens.equivalence_id = shared_id
    return op

def set_ifm_ofm_op_shapes(op, arch, nng):
    """Call op.set_ifm_ofm_shapes() for NPU ops that need shapes and have none set yet.

    Nothing is done when the op does not run on the NPU, when its type does not need shapes, or when
    either ifm_shapes or ofm_shapes is already populated. arch and nng are unused. Returns op.
    """
    wants_shapes = op.run_on_npu and op.type.needs_shapes()
    # A non-empty ifm_shapes or ofm_shapes means the shapes are already set
    if wants_shapes and not (op.ifm_shapes or op.ofm_shapes):
        op.set_ifm_ofm_shapes()
    return op

Johan Alfvén

2022-09-28 20:06:25 +0200

[diff] [blame^]

202

def bypass_need_to_keep_ofm_shape(op):
    """Return True if the ifm must be replaced by the ofm when op is bypassed.

    That is the case when an ofm consumer has a type in original_ifm_shape_ops (the op that follows must
    have the original ifm shape), or when the rank of op.ifm.shape differs from the rank of op.ofm.shape.
    None entries in the ofm consumer list are ignored.
    """
    for ofm_consumer in op.ofm.consumer_list:
        if ofm_consumer is not None and ofm_consumer.type in original_ifm_shape_ops:
            return True
    # No consumer requires the original shape; only a rank change remains as a reason
    return len(op.ifm.shape) != len(op.ofm.shape)

208

209

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

210

def bypass_memory_only_ops(op):
    """Bypass a memory only op by letting its producers and consumers share a single tensor.

    Either the ofm takes the place of the ifm (the ifm producers output the ofm and all ifm consumers
    read the ofm), or the ifm takes the place of the ofm (all ofm consumers read the ifm).

    op.type must be in memory_only_ops. The case where both the ifm and the ofm need to persist is
    expected to have been handled before this function is called; both conditions are asserted.
    """
    assert op.type in memory_only_ops
    ifm = op.ifm
    ofm = op.ofm

    # Check if ifm/ofm are network ifm/ofm
    ifm_is_sg_ifm = ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)
    ifm_is_sg_ofm = any(consumer is None for consumer in ifm.consumer_list)
    ofm_is_sg_ofm = any(consumer is None for consumer in ofm.consumer_list)
    # Check if ifm/ofm is produced respectively consumed by CPU
    ifm_is_cpu_produced = any(producer is not None and not producer.run_on_npu for producer in ifm.ops)
    ofm_is_cpu_consumed = any(consumer is not None and not consumer.run_on_npu for consumer in ofm.consumer_list)

    ifm_must_persist = ifm_is_sg_ifm or ifm_is_sg_ofm or ifm_is_cpu_produced
    ofm_must_persist = ofm_is_sg_ofm or ofm_is_cpu_consumed

    # This case should be handled prior to this function
    assert not (ifm_must_persist and ofm_must_persist)

    replace_ifm_with_ofm = (ifm.shape != ofm.shape) and (ofm_must_persist or bypass_need_to_keep_ofm_shape(op))

    if replace_ifm_with_ofm:
        # Bypassed by replacing ifm with ofm: every ifm producer now outputs the ofm only
        ofm.ops = []
        for producer in ifm.ops:
            producer.outputs = [ofm]
            ofm.ops.append(producer)

        # All ifm consumers need to use ofm as input
        # NOTE(review): set_input_tensor presumably updates consumer lists while ifm.consumer_list
        # is being iterated here - confirm against its implementation
        for consumer in ifm.consumer_list:
            for input_idx, input_tens in enumerate(consumer.inputs):
                if input_tens == ifm:
                    consumer.set_input_tensor(ofm, input_idx)
    else:
        # Bypassed by replacing ofm with ifm
        for consumer in ofm.consumer_list:
            for input_idx, input_tens in enumerate(consumer.inputs):
                if input_tens == ofm:
                    consumer.set_input_tensor(ifm, input_idx)

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

244

245

Patrik Gustavsson

f1580f0

2021-09-01 12:43:02 +0200

[diff] [blame]

246

def move_splitsliceread_to_consumer(op, cons_op):
    """Move the read of a SplitSliceRead op into its consumer cons_op and unlink op from the graph.

    When cons_op uses op.ofm as ifm (or as ifm2 of a binary elementwise op), the matching
    read offset, read shape, input tensor and ifm shape of cons_op are taken from op. In every case
    cons_op is removed from the consumers of op.ofm, op.ofm loses its producers and op is removed
    from the consumers of op.ifm.
    """
    assert op.type == Op.SplitSliceRead

    # Which cons_op input (0 = ifm, 1 = ifm2) reads the ofm of the SplitSliceRead, if any
    if cons_op.ifm == op.ofm:
        slot = 0
    elif cons_op.type.is_binary_elementwise_op() and cons_op.ifm2 == op.ofm:
        slot = 1
    else:
        slot = None

    if slot is not None:
        cons_op.read_offsets[slot] = op.read_offsets[0]
        cons_op.read_shapes[slot] = op.read_shapes[0]
        cons_op.set_input_tensor(op.ifm, cons_op.type.info.indices.ifms[slot])
        cons_op.ifm_shapes[slot] = op.ifm_shapes[0]

    # Unlink the SplitSliceRead op from both of its tensors
    op.ofm.consumer_list.remove(cons_op)
    op.ofm.ops = []
    op.ifm.consumer_list.remove(op)

263

264

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

265

def check_memory_only_removed(op, arch):
    """Raise VelaError if op runs on the NPU and is still a memory only op.

    Ops that do not run on the NPU are not checked. arch is unused. Returns None.
    """
    if not op.run_on_npu:
        return
    if op.type in memory_only_ops:
        # Memory only operators should have been removed
        raise VelaError(f"Memory only {op.type} op {op} expected to have been removed, still remains")

Patrik Gustavsson

2021-06-28 07:41:58 +0200

[diff] [blame]

269

270

271

def record_optimised(op, arch):
    """Register op with DebugDatabase.add_optimised, passing op as both arguments.

    Ops of type Op.Const are skipped. arch is unused. Returns None.
    """
    if op.type == Op.Const:
        return
    DebugDatabase.add_optimised(op, op)

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

274

275

Johan Alfvén

2022-09-28 20:06:25 +0200

[diff] [blame^]

276

def insert_copy_op_before_op(op):
    """Insert an avgpool nop between op.ifm and op, so that op reads a clone of its original ifm.

    The new op is registered with DebugDatabase.add_optimised using op as the first argument.
    """
    # Create a avg_pool nop op with ifm as input
    original_ifm = op.ifm
    cloned_ifm = original_ifm.clone()
    nop = create_avgpool_nop(f"{original_ifm.name}_avgpool")
    nop.add_input_tensor(original_ifm)
    nop.set_output_tensor(cloned_ifm)
    nop.set_ifm_ofm_shapes()

    # Only op is redirected to the clone (input index 0)
    op.set_input_tensor(cloned_ifm, 0)

    DebugDatabase.add_optimised(op, nop)

288

289

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

290

def insert_copy_op_after_tens(tens):
    """Insert an avgpool nop after tens and redirect all existing consumers of tens to the nop output.

    The nop reads tens and writes a clone of it; it is marked to run on the NPU. None entries in the
    consumer list are left alone. The new op is registered with DebugDatabase.add_optimised using the
    first producer of tens as the first argument.
    """
    # Snapshot taken before the nop is connected, so only the pre-existing consumers are redirected
    previous_consumers = list(tens.consumer_list)

    # Create a avg_pool nop op with ifm as input
    cloned_tens = tens.clone()
    nop = create_avgpool_nop(tens.name + "_avgpool")
    nop.add_input_tensor(tens)
    nop.set_output_tensor(cloned_tens)
    nop.set_ifm_ofm_shapes()
    nop.run_on_npu = True

    # Set copy_ifm consumers
    for consumer in previous_consumers:
        if consumer is None:
            continue
        for input_idx, input_tens in enumerate(consumer.inputs):
            if input_tens == tens:
                consumer.set_input_tensor(cloned_tens, input_idx)

    DebugDatabase.add_optimised(tens.ops[0], nop)

309

310

311

def fix_sg_input_output(op, arch, nng):

Jonas Ohlsson

2021-09-01 15:57:21 +0200

[diff] [blame]

312

if not op.run_on_npu or op.type not in memory_only_ops:

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

313

return op

314

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

315

# For the memory only operators we want to remove, tensors are removed.

316

# But in order to do this, they cannot be outputs of the sg,

317

# this need to be fixed prior to the removal.

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

318

# Solution is to add a avgpool NOP, to maintain the original tensor.

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

319

# This is also valid when reshape ifm/ofm is produced respectively

320

# consumed by CPU

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

321

Johan Alfvén

2022-09-28 20:06:25 +0200

[diff] [blame^]

322

# Rare case: original_ifm_shape_ops contain ops that are dependent

323

# that the original ifm tensor shape is not changed by the bypass memory

324

# function. If the memory only op ifm is subgraph ifm/ifm is cpu produced

325

# or the ifm is consumed by many, then there is a need to insert an avgpool

326

# NOP before the original_ifm_shape_ops. Also note that the NOP is only inserted

327

# before original_ifm_shape_ops. The above is also true when the memory only

328

# op change the rank between the IFM and OFM.

329

#

330

# Below is an example showing the case when there is a need for an AVG NOP

331

# when RESHAPE is bypassed by replacing IFM with OFM.

332

#

333

# Converts to And in bypass_memory

334

# ---> --->

335

# -----ADD----- -----ADD----- -----ADD-----

336

# | | | | | |

337

# 1x6x6x10 1x6x6x10 1x6x6x10 1x6x6x10 1x6x6x10 1x6x6x10

338

# RESHAPE MEAN AVG POOL MEAN AVG POOL MEAN

339

# | | | |

340

# 1x20x3x6 1x6x6x10 1x20x3x6

# MEAN RESHAPE MEAN

# |

# 1x20x3x6

# MEAN

ifm_has_multiple_cons = len(op.ifm.consumer_list) > 1

346

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

347

# Check if operator ifm/ofm are sg ifm/ofm

Patrik Gustavsson

2021-08-23 15:33:59 +0200

[diff] [blame]

348

ifm_is_sg_ifm = op.ifm.ops[0].type in (Op.Placeholder, Op.SubgraphInput, Op.Const)

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

349

ifm_is_sg_ofm = any(ifm_cons is None for ifm_cons in op.ifm.consumer_list)

350

ofm_is_sg_ofm = any(ofm_cons is None for ofm_cons in op.ofm.consumer_list)

351

# Check if ifm/ofm is produced respectively consumed by CPU

Johan Alfvén

5060ff5

2022-09-15 15:50:30 +0200

[diff] [blame]

352

ifm_is_cpu_produced = any(ifm_prod is not None and not ifm_prod.run_on_npu for ifm_prod in op.ifm.ops)

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

353

ofm_is_cpu_consumed = any(ofm_cons is not None and not ofm_cons.run_on_npu for ofm_cons in op.ofm.consumer_list)

Johan Alfvén

5060ff5

2022-09-15 15:50:30 +0200

[diff] [blame]

354

Johan Alfvén

2022-09-28 20:06:25 +0200

[diff] [blame^]

355

if bypass_need_to_keep_ofm_shape(op):

356

# Bypass need to keep OFM shape

357

if ifm_has_multiple_cons:

358

# Rare case:

359

# IFM need to persist due to multiple consumers and copy op is needed

360

# OFM will replace IFM for the memory only op

361

insert_copy_op_before_op(op)

362

elif not (ofm_is_sg_ofm or ofm_is_cpu_consumed):

363

# Only one consumer and OFM is not subgraph output or cpu consumed,

364

# safe to replace ifm.shape by ofm.shape

365

# IFM can then replace OFM for the memory only op and no copy op is needed

366

op.ifm.shape = op.ofm.shape

367

368

# Special case when OFM is sg_ofm or cpu_consumed

Johan Alfvén

2022-09-28 14:22:54 +0200

[diff] [blame]

369

if (ifm_is_sg_ofm or ifm_is_sg_ifm or ifm_is_cpu_produced) and (ofm_is_sg_ofm or ofm_is_cpu_consumed):

370

# Both ifm and ofm need to persist, but only ifm need a copy, in order to remove the memory only operator.

371

insert_copy_op_after_tens(op.ifm)

Patrik Gustavsson