Blame - ethosu/vela/pass_packing.py - ml/ethos-u/ethos-u-vela

2024-05-13 13:44:42 +0200

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Rickard Bolin

bc6ee58

2022-11-04 08:24:29 +0000

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

18

# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

19

import collections

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

20

import enum

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

21

Tim Hall

e6ccd87

2020-11-09 16:46:37 +0000

[diff] [blame]

22

from .debug_database import DebugDatabase

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

23

from .nn_graph import Pass

24

from .nn_graph import PassPlacement

25

from .operation import NpuBlockType

Louis Verhaard

2020-09-30 09:01:52 +0200

[diff] [blame]

26

from .operation import Op

Fredrik Svedberg

d9c2c42

2020-12-01 16:33:45 +0100

[diff] [blame]

27

from .operation_util import create_avgpool_nop

Diego Russo

2020-04-14 18:41:58 +0100

[diff] [blame]

28

from .tensor import TensorPurpose

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

29

30

31

class PassFlags(enum.Flag):
    """Bit flags describing what has been packed into a pass so far.

    Each member occupies its own bit so that members can be combined with
    ``|`` and tested with ``&``.
    """

    Empty = 0
    Main = 1 << 0
    Post = 1 << 1
    Mac = 1 << 2
    ElementWise = 1 << 3
    Npu = 1 << 4
    Cpu = 1 << 5
    StartupInit = 1 << 6
    MemoryOnly = 1 << 7
    PostFusingLimited = 1 << 8
    Memcpy = 1 << 9

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

43

44

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

45

# Operation-type sets used by the packing rules in test_sequence.

# Main operations that set the Mac flag when packed
mac_main_ops = {
    # convolutions
    Op.Conv2DBias,
    Op.Conv2D,
    Op.QuantizedConv2D,
    Op.Conv2DBackpropInputSwitchedBias,
    # depth-wise convolutions
    Op.DepthwiseConv2DBias,
    # FC layers
    Op.QuantizedMatMul,
    Op.MatMul,
    Op.FullyConnected,
    # pooling
    Op.QuantizedMaxPool,
    Op.QuantizedAvgPool,
    Op.AvgPool,
    Op.MaxPool,
    Op.ReduceSum,
    # resize ops use pooling operations unless explicitly converted to other operations prior to pass packing
} | Op.op_set(Op.is_resize_op)

binary_elem_wise_main_ops = Op.op_set(Op.is_binary_elementwise_op)

unary_elem_wise_main_ops = Op.op_set(Op.is_unary_elementwise_op)

elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops

activation_ops = Op.op_set(Op.is_relu_op)
# Deliberately the same set object as activation_ops
npu_post_ops = activation_ops

# Set of post operators that should not be fused with main/elementwise ops
npu_post_fuse_limited_ops = {Op.Sigmoid, Op.Tanh, Op.Quantize}

elem_wise_ops = elem_wise_main_ops | activation_ops | {Op.Sigmoid, Op.Tanh}


quantization_ops = {Op.Dequantize, Op.Max, Op.Min}
cpu_ops = {Op.Softmax, Op.LRN, Op.Shape, Op.Pad, Op.AddN} | quantization_ops

startup_init_ops = {Op.Const, Op.Placeholder, Op.SubgraphInput}
memory_only_ops = {
    Op.Squeeze,
    Op.Reshape,
    Op.QuantizedReshape,
    Op.ExpandDims,
}
memcpy_ops = {Op.Memcpy}

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

# Packing rules, evaluated in list order for every operation considered for a pass.
# Each entry is a 4-tuple:
#   (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear)
# An ops_set of None matches any operation type.
test_sequence = [
    (
        npu_post_ops,  # ops_set
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Main,  # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Post,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        npu_post_fuse_limited_ops,  # ops_set
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Main | PassFlags.PostFusingLimited,  # incompatible_pack_flags
        PassFlags.Npu | PassFlags.PostFusingLimited,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        mac_main_ops,  # ops_set
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.ElementWise | PassFlags.Main | PassFlags.PostFusingLimited,
        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        elem_wise_main_ops,  # ops_set
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Mac | PassFlags.Main | PassFlags.PostFusingLimited,
        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        startup_init_ops,  # ops_set
        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,  # incompatible_pack_flags
        PassFlags.StartupInit | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        memory_only_ops,  # ops_set
        PassFlags.Npu | PassFlags.Cpu,  # incompatible_pack_flags
        PassFlags.MemoryOnly | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        memcpy_ops,  # ops_set
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Mac | PassFlags.Main | PassFlags.PostFusingLimited,
        PassFlags.Npu | PassFlags.Memcpy | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (
        cpu_ops,  # ops_set
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,  # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
    (  # This last one is a fallback for unrecognised operations
        None,  # ops_set
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,  # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.Main,  # flags_to_set
        PassFlags.Empty,  # flags_to_clear
    ),
]

# Some sanity checking: a rule must never both set and clear the same flag
for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
    assert not (flags_to_clear & flags_to_set)

196

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

197

198

def pack_into_passes(nng, arch, verbose_packing=False):

Johan Alfvén

2022-01-27 06:47:26 +0100

[diff] [blame]

199

def visit_op(op, ignored):
    """Count a visit to *op* and build its pass once all outputs are accounted for.

    The second argument is not used. Outputs that have no consumers are
    counted on the first visit, since nothing will ever reach the op through
    them. When the count equals the number of outputs, a startup-init op is
    queued on startup_list and any other op is handed to build_pass.
    """
    visit_op_refcount[op] += 1

    if visit_op_refcount[op] == 1:
        # First-time visit, go and fix up unused output tensors
        visit_op_refcount[op] += sum(1 for out_tens in op.outputs if len(out_tens.consumers()) == 0)

    assert visit_op_refcount[op] <= len(op.outputs)
    if visit_op_refcount[op] != len(op.outputs):
        return

    if op.type in startup_init_ops:
        startup_list.append(op)
        return

    ofm_tensor = op.ofm
    if ofm_tensor is None:
        # No dedicated ofm, fall back to the first output
        ofm_tensor = op.outputs[0]
    ofm_shape = None
    if op.run_on_npu:
        ofm_shape = op.ofm_shapes[0]

    build_pass((op,), ofm_tensor, ofm_shape)

Patrik Gustavsson

6bb8f67

2020-12-21 14:49:13 +0100

[diff] [blame]

219

Johan Alfvén

2022-01-27 06:47:26 +0100

[diff] [blame]

220

def build_pass(start_ops_to_process, ofm_tensor=None, ofm_shape=None):
    """Pack the start ops, plus any producers that can be fused, into one Pass.

    Starting from start_ops_to_process, each operation is matched against the
    rules in test_sequence. An operation accepted by a rule is added to the
    pass and its inputs are examined: inputs whose producer can be packed
    (see can_pack) are queued for processing, the others become pass inputs.
    An operation that no rule accepts is left out and the tensor through which
    it was reached becomes a pass input.

    The finished Pass is appended to reverse_pass_list, every packed op gets
    its scheduled_pass set, and visit_tensor is called for each pass input,
    once per recorded reference.

    Returns the newly created Pass.
    """
    packed_ops_reversed = []
    curr_flags = PassFlags.Empty
    npu_block_type = NpuBlockType.Default

    reverse_intermediates = []
    input_set = set()
    ifm_tensor = None
    primary_op = None
    ifm_shapes = None

    # Queue of (operation, tensor through which the operation was reached)
    to_process = collections.deque((start_op, None) for start_op in start_ops_to_process)

    while to_process:
        curr_op, tens = to_process.popleft()

        if curr_op in packed_ops_reversed:
            continue

        for rule_ops, blocking_flags, flags_to_set, flags_to_clear in test_sequence:
            if rule_ops is not None and curr_op.type not in rule_ops:
                continue
            if curr_flags & blocking_flags:
                continue
            rule_is_npu = flags_to_set & PassFlags.Npu
            if rule_is_npu and not curr_op.run_on_npu:
                # NPU rule but the op is not placed on the NPU, try the next rule
                continue

            packed_ops_reversed.append(curr_op)
            new_block_type = curr_op.type.npu_block_type
            if new_block_type != NpuBlockType.Default:
                assert npu_block_type == NpuBlockType.Default
                npu_block_type = new_block_type  # Only one major block type per pass
                assert primary_op is None
                primary_op = curr_op

            curr_flags = (curr_flags & ~flags_to_clear) | flags_to_set

            if rule_is_npu and flags_to_set & (
                PassFlags.Mac
                | PassFlags.ElementWise
                | PassFlags.Post
                | PassFlags.PostFusingLimited
                | PassFlags.Memcpy
            ):
                assert len(curr_op.inputs) >= 1
                ifm_tensor = curr_op.ifm
                ifm_shapes = curr_op.ifm_shapes.copy()
                assert ifm_tensor is not None, "IFM missing in {}".format(curr_op)
                assert ifm_tensor.purpose == TensorPurpose.FeatureMap

            if rule_ops is None:
                assert not curr_op.run_on_npu  # operator should have been placed on the CPU

            for inp in reversed(curr_op.inputs):
                if inp is None:
                    continue
                if can_pack(inp, curr_op):
                    to_process.append((inp.ops[0], inp))
                else:
                    input_set.add(inp)

            break

        else:
            # This operation is not compatible with already packed operations, just register the tensor as an input
            assert tens is not None
            input_set.add(tens)

    if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
        # Make the choice that if we don't have a mac operation, the ambidextrous operations go on the
        # element wise unit
        curr_flags |= PassFlags.ElementWise

    is_element_wise = all(op.type in elem_wise_ops or not op.type for op in packed_ops_reversed)

    # Exactly one of these flags is expected to be set; map it to the pass placement
    placement = PassPlacement.Unknown
    for flag, flag_placement in (
        (PassFlags.Npu, PassPlacement.Npu),
        (PassFlags.Cpu, PassPlacement.Cpu),
        (PassFlags.MemoryOnly, PassPlacement.MemoryOnly),
        (PassFlags.StartupInit, PassPlacement.StartupInit),
    ):
        if curr_flags & flag:
            assert placement == PassPlacement.Unknown
            placement = flag_placement
    assert placement != PassPlacement.Unknown

    ops_list = packed_ops_reversed[::-1]
    intermediates = reverse_intermediates[::-1]

    if primary_op is None:
        primary_op = create_primary_op(ops_list)
        if primary_op is not None:
            visit_tensor_refcount[primary_op.inputs[0]] += 1
            npu_block_type = primary_op.type.npu_block_type
            input_set.update(primary_op.inputs)

    ordered_input_list = []
    # Keep LUT-s in a separate list and add as inputs at the end
    # to avoid that they would accidentally be assigned as ifm or ifm2
    lut_list = []
    input_refcounts = collections.defaultdict(int)
    input_ops_list = list(ops_list)

    # Check primary_op first
    if primary_op is not None:
        for inp in primary_op.inputs:
            if inp is not None:
                add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)
        input_ops_list.remove(primary_op)

    # Check rest of the list
    for op in input_ops_list:
        for inp in op.inputs:
            add_input_list(inp, input_set, input_refcounts, lut_list, ordered_input_list)

    ps = Pass(ops_list[0].name, placement, is_element_wise, npu_block_type)
    ps.ops = ops_list
    ps.primary_op = primary_op
    ps.inputs = ordered_input_list
    ps.intermediates = intermediates
    ps.outputs = list(ops_list[-1].outputs)

    if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
        # ElementWise operation, 2 IFMs
        ps.ifm_tensor = ps.inputs[-2] if len(ps.inputs) > 2 else ps.inputs[0]
        ps.ifm2_tensor = ps.inputs[-1]

        # Get the corresponding ifm_shapes
        for op in input_ops_list + [primary_op]:
            if not op.run_on_npu:
                continue
            for pass_ifm in (ps.ifm_tensor, ps.ifm2_tensor):
                if pass_ifm == op.ifm:
                    ps.ifm_shapes.append(op.ifm_shapes[0])
                elif pass_ifm == op.ifm2:
                    ps.ifm_shapes.append(op.ifm_shapes[1])
    else:
        ps.ifm_tensor = ifm_tensor
        ps.ifm2_tensor = None
        if ps.primary_op is not None and ps.primary_op.run_on_npu:
            ps.ifm_shapes.append(ifm_shapes[0])

    ps.ofm_tensor = ofm_tensor
    ps.ofm_shapes.append(ofm_shape)

    assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
    ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
    ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
    ps.lut_tensor = ps.get_primary_op_lut()
    ps.inputs.extend(lut_list)

    for op in ps.ops:
        op.scheduled_pass = ps

    reverse_pass_list.append(ps)

    # Continue the traversal through the inputs of this pass, once per reference
    for inp, refcount in input_refcounts.items():
        for _ in range(refcount):
            visit_tensor(inp)

    return ps

def visit_tensor(tens):
    """Count a visit to *tens*; once every consumer has visited, visit its producers.

    The producing ops in tens.ops are visited in reverse order.
    """
    visit_tensor_refcount[tens] += 1
    assert visit_tensor_refcount[tens] <= len(tens.consumers())
    if visit_tensor_refcount[tens] != len(tens.consumers()):
        # Still waiting for other consumers of this tensor
        return
    for producer in reversed(tens.ops):
        visit_op(producer, tens)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

409

Jacob Bohlin

fb85873

2020-08-17 09:42:35 +0200

[diff] [blame]

410

def create_primary_op(op_list):
    """Insert an AvgPool nop as primary op when the list contains an NPU post op.

    If no op in op_list is a post op (npu_post_ops or
    npu_post_fuse_limited_ops) that runs on the NPU, nothing is changed and
    None is returned. Otherwise an AvgPool nop is created in front of the
    first op of the list: it reads that op's first input and writes a clone
    of it, which replaces the op's first input. The new op is inserted at the
    head of op_list (the list is modified in place) and returned.
    """
    post_op_types = npu_post_ops | npu_post_fuse_limited_ops
    if not any(op.type in post_op_types and op.run_on_npu for op in op_list):
        return None

    # Configure a 1x1 AvgPool and attach the op onto it
    first_op = op_list[0]
    orig_input = first_op.inputs[0]
    avgpool_op = create_avgpool_nop(first_op.name + "_avgpool")
    avgpool_op.add_input_tensor(orig_input)
    avgpool_out = orig_input.clone("_avgpooled")
    avgpool_out.consumer_list.append(first_op)
    avgpool_op.set_output_tensor(avgpool_out)
    # The AvgPool nop takes over shape and read information from the op it feeds
    avgpool_op.ifm_shapes = first_op.ifm_shapes.copy()
    avgpool_op.ofm_shapes = first_op.ofm_shapes.copy()
    avgpool_op.read_offsets = first_op.read_offsets.copy()
    avgpool_op.read_shapes = first_op.read_shapes.copy()

    first_op.inputs[0] = avgpool_out
    op_list.insert(0, avgpool_op)

    DebugDatabase.add_optimised(first_op, avgpool_op)
    return avgpool_op

Patrik Gustavsson

2021-02-03 09:13:57 +0100

[diff] [blame]

433

def can_pack(inp, curr_op):
    """Return True if the producer of *inp* may be packed together with *curr_op*.

    Packing is rejected when:
    - inp does not have exactly one producing op,
    - any output of that producer has more than one consumer, or a single
      consumer other than curr_op,
    - both ops carry shape information and the producer's first ofm shape
      differs from the curr_op ifm shape that corresponds to inp.
    """
    if len(inp.ops) != 1:
        return False

    producer = inp.ops[0]
    for out_tens in producer.outputs:
        users = out_tens.consumers()
        if len(users) > 1:
            return False
        if len(users) == 1 and users[0] != curr_op:
            return False

    if len(curr_op.ifm_shapes) == 0 or len(producer.ofm_shapes) == 0:
        # No shape information to compare
        return True

    # There cannot be any reshaping between next_op ofm and corresponding curr_op ifm
    if inp == curr_op.ifm and producer.ofm_shapes[0] != curr_op.ifm_shapes[0]:
        return False
    if curr_op.ifm2 is not None and inp == curr_op.ifm2 and producer.ofm_shapes[0] != curr_op.ifm_shapes[1]:
        return False

    return True

Diqing Zhong

2020-08-25 10:40:36 +0200

[diff] [blame]

454

def add_input_list(inp_to_add, inp_set, inp_refcnts, lut_list, ordered_inp_list):
    """Record one reference to *inp_to_add* if it is a member of *inp_set*.

    On the first reference the tensor is appended to lut_list when its
    purpose is TensorPurpose.LUT, otherwise to ordered_inp_list. Every
    reference increments inp_refcnts[inp_to_add]. Tensors that are not in
    inp_set are ignored.
    """
    if inp_to_add not in inp_set:
        return

    if inp_refcnts[inp_to_add] == 0:
        # First reference: decide which list the tensor belongs to
        destination = lut_list if inp_to_add.purpose == TensorPurpose.LUT else ordered_inp_list
        destination.append(inp_to_add)
    inp_refcnts[inp_to_add] += 1

462

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

463

for sg in nng.subgraphs:

464

reverse_pass_list = []

465

visit_op_refcount = collections.defaultdict(int)

466

visit_tensor_refcount = collections.defaultdict(int)

startup_list = []

for tens in sg.output_tensors:

visit_tensor(tens)

if startup_list:

startup_ps = build_pass(startup_list)

475

startup_ps.outputs = [op.outputs[0] for op in startup_list] # Need to fixup the outputs

476

startup_ps.name = "startup_weight_initialisation"

477

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

478

# Graphs with both CPU and NPU ops might not have an optimal order in

479

# the pass list due to how the graph is traversed (depth first search).

480

# This can result in more context switching between CPU and NPU.

481

# Try to optimize this by moving/grouping CPU ops where that is possible.

482

# Criteria for CPU pass to be moved:

483

#

Johan Alfven

a8fda88

2023-10-28 16:04:46 +0200

[diff] [blame]

484

# 1) CPU passes that only depend on sg.input_tensors can be

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

485

# moved to the top of the list.

Johan Alfven

2023-02-07 13:01:03 +0100

[diff] [blame]

486

# ResourceVariables ops like VarHandle, ReadVariable, CallOnce

487

# can also be moved to the top of list.

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

488

#

489

# 2) A CPU pass X is allowed to be grouped together with CPU pass Y

490

# if there is no NPU pass between pass X and pass Y that depends

491

# on output from pass X or a MemoryOnly pass.

492

#

493

# Criteria 2 will try to move as many CPU passes towards the bottom of

# the list.

pass_list_top = []

pass_list = []

# Filter out early passes from the rest

500

for ps in list(reversed(reverse_pass_list)):

501

if startup_ps == ps:

502

# startup pass belongs in the top

503

pass_list_top.insert(0, ps)

504

continue

505

Johan Alfven

a8fda88

2023-10-28 16:04:46 +0200

[diff] [blame]

506

ifm2 = ps.ops[0].ifm2

507

if ifm2 is None:

508

# Dynamic weights must be treated as ifm's.

509

if ps.ops[0].type == Op.FullyConnected and ps.ops[0].weights.purpose == TensorPurpose.FeatureMap:

510

# Op has dynamic weights, include this in the check below

511

ifm2 = ps.ops[0].weights

512

Johan Alfven

2023-02-07 13:01:03 +0100

[diff] [blame]

513

if ps.placement == PassPlacement.Cpu and (

514

ps.ops[0].ifm in sg.input_tensors

Johan Alfven

a8fda88

2023-10-28 16:04:46 +0200

[diff] [blame]

515

and (ifm2 in sg.input_tensors or ifm2 is None)

Johan Alfven

2023-02-07 13:01:03 +0100

[diff] [blame]

516

or (ps.ops[0].type in (Op.VarHandle, Op.ReadVariable, Op.CallOnce))

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

517

):

Johan Alfven

2023-02-07 13:01:03 +0100

[diff] [blame]

518

# This CPU pass only depends on sg.input_tensors or resource variable

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

519

pass_list_top.append(ps)

520

else:

521

# Add pass to the list that will be sorted in the next step

522

pass_list.append(ps)

523

Johan Alfven

2023-02-07 13:01:03 +0100

[diff] [blame]

524

# Sort ops by op_index (same call order as in the original graph)

525

pass_list_top = sorted(pass_list_top, key=lambda ps: -1 if ps.ops[0].op_index is None else ps.ops[0].op_index)

526

Johan Alfven

2024-05-13 13:44:42 +0200

[diff] [blame]

527

# A concat is implemented by several AvgPool ops writing to the same ofm but with slice offset

Johan Alfven

2024-05-20 11:31:41 +0200

[diff] [blame]

528

# If there is a cpu op in between, group all AvgPool ops for a concat so that they run

529

# within the same cmd stream

Johan Alfven

2024-05-13 13:44:42 +0200

[diff] [blame]

530

last_idx = len(pass_list) - 1

531

for npu_ps in reversed(pass_list):

532

if npu_ps.placement == PassPlacement.Cpu or not npu_ps.ops[0].original_type.is_concat_op():

533

continue

534

# Concat pass found, search forward for the next avgpool op writing to the same ofm

535

idx = pass_list.index(npu_ps)

Johan Alfven

2024-05-20 11:31:41 +0200

[diff] [blame]

536

concat_is_split_between_npu_ops = False

Johan Alfven

2024-05-13 13:44:42 +0200

[diff] [blame]

537

for next_ps in pass_list[idx + 1 :]:

Johan Alfven

2024-05-20 11:31:41 +0200

[diff] [blame]

538

if next_ps.placement == PassPlacement.Cpu:

539

concat_is_split_between_npu_ops = True

Johan Alfven

2024-05-13 13:44:42 +0200

[diff] [blame]

540

next_is_concat = next_ps.ops[0].original_type.is_concat_op()

Johan Alfven

2024-05-20 11:31:41 +0200

[diff] [blame]

541

if next_is_concat and next_ps.ops[0].ofm == npu_ps.ops[0].ofm and concat_is_split_between_npu_ops:

542

# Avgpool writing to the same OFM and there is a cpu op between them, group them

Johan Alfven

2024-05-13 13:44:42 +0200

[diff] [blame]

543

pass_list.remove(npu_ps)

544

insert_index = pass_list.index(next_ps)

545

pass_list.insert(insert_index, npu_ps)

546

break

547

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

548

# Sort the rest of the list based on criteria 2.

549

# Search from bottom of list and when a CPU pass is found

550

# search forward in the list and see if it is possible to join another CPU pass.

Johan Alfvén

2022-05-19 07:26:03 +0200

[diff] [blame]

551

last_idx = len(pass_list) - 1

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

552

for cpu_ps in reversed(pass_list):

553

if cpu_ps.placement != PassPlacement.Cpu:

554

continue

555

# CPU pass found, search forward and move pass if possible

556

idx = pass_list.index(cpu_ps)

557

for next_ps in pass_list[idx + 1 :]:

558

if next_ps.placement == PassPlacement.Cpu:

559

# It is possible to move the CPU pass

560

pass_list.remove(cpu_ps)

561

insert_index = pass_list.index(next_ps)

562

pass_list.insert(insert_index, cpu_ps)

563

break

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

564

Johan Alfven

c72cac8

2023-03-09 16:01:00 +0100

[diff] [blame]

565

# Check all outputs from the cpu pass

Johan Alfvén

2022-05-19 07:26:03 +0200

[diff] [blame]

566

if (

Johan Alfven

c72cac8

2023-03-09 16:01:00 +0100

[diff] [blame]

567

any(ofm in [next_ps.ops[0].ifm, next_ps.ops[0].ifm2] for ofm in cpu_ps.ops[0].outputs)

Johan Alfvén

2022-05-19 07:26:03 +0200

[diff] [blame]

568

or next_ps.placement == PassPlacement.MemoryOnly

569

):

Johan Alfven

c72cac8

2023-03-09 16:01:00 +0100

[diff] [blame]

570

# Not possible to move since next pass depends on the output from the cpu pass

Johan Alfvén

2022-05-19 07:26:03 +0200

[diff] [blame]

571

break

572

573

if pass_list.index(next_ps) == last_idx:

574

# Last element, ok to move the CPU pass

575

pass_list.remove(cpu_ps)

576

pass_list.append(cpu_ps)

577

break

578

Johan Alfvén

2022-04-19 16:07:05 +0200

[diff] [blame]

579

pass_list_top.extend(pass_list)

580

581

sg.passes = pass_list_top

Tim Hall