blob: c4f66b8f75b7abcb31b0e609fdd6c70c0f42a7a8 [file] [log] [blame]
Raul Farkas72c6a242023-03-16 16:38:05 +00001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Tim Hall79d07d22020-04-27 18:20:16 +010017# Description:
18# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
19# untouched in the final output.
20#
21# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
22# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
23# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
Tim Hall79d07d22020-04-27 18:20:16 +010024import numpy as np
25
Diego Russoe8a10452020-04-21 17:39:10 +010026from .nn_graph import Pass
27from .nn_graph import PassPlacement
28from .nn_graph import Subgraph
Louis Verhaardaee5d752020-09-30 09:01:52 +020029from .operation import CustomType
Diego Russoe8a10452020-04-21 17:39:10 +010030from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010032from .operation import Operation
Diego Russoea6111a2020-04-14 18:41:58 +010033
Tim Hall79d07d22020-04-27 18:20:16 +010034
def make_npu_call_op_pass(npu_subgraph):
    """Build the pass that invokes an NPU subgraph from the CPU graph.

    Wraps a CustomNpuOp (custom_type NpuOp) pointing at npu_subgraph inside a
    MemoryOnly pass and returns that pass.
    """
    call_op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
    call_op.attrs["subgraph"] = npu_subgraph
    call_op.attrs["custom_type"] = CustomType.NpuOp

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op
    call_op.scheduled_pass = call_ps

    # Inputs and outputs are attached later, as the graphs are cut apart.
    return call_ps
46
47
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Replace every reference to orig_tens on op with new_tens.

    Updates the op's own input/output lists and, if the op has been scheduled,
    the corresponding lists and named tensor slots of its scheduled pass.
    """

    def swapped(tensors):
        # Substitute new_tens for each occurrence of orig_tens (by equality).
        return [new_tens if t == orig_tens else t for t in tensors]

    op.inputs = swapped(op.inputs)
    op.outputs = swapped(op.outputs)

    ps = op.scheduled_pass
    if ps is None:
        return

    ps.inputs = swapped(ps.inputs)
    ps.outputs = swapped(ps.outputs)

    # The pass also caches specific tensor roles; keep them in sync.
    for slot in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(ps, slot) == orig_tens:
            setattr(ps, slot, new_tens)
70
71
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the CPU but consumed inside an NPU subgraph.

    A clone ("_npu") of the tensor is created inside the NPU subgraph, fed by a
    SubgraphInput op (or a Const op when the producer is a constant) placed in
    the startup-init pass. All NPU-side consumers are repointed at the clone.
    """
    is_const = orig_tens.ops[0].type == Op.Const
    new_tens = orig_tens.clone("_npu")

    # Constants are materialised directly inside the NPU subgraph; anything
    # else enters through a SubgraphInput placeholder.
    input_op = Operation(Op.Const if is_const else Op.SubgraphInput, orig_tens.name + "_input")
    input_op.scheduled_pass = startup_init_ps
    input_op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(input_op)
    startup_init_ps.outputs.append(new_tens)

    # Non-constant data must actually be passed in through the NpuOp call.
    if not is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    if orig_tens in cpu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Repoint every consumer scheduled into the NPU subgraph at the clone.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] == npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [new_tens if t == orig_tens else t for t in npu_subgraph.output_tensors]
110
111
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass, multiple_npu_sg_have_same_cpu_out_tens
):
    """Rewire a tensor produced inside an NPU subgraph but consumed on the CPU.

    The NPU-side tensor becomes an output of the NPU subgraph, and a CPU-side
    twin (produced by the NpuOp call) replaces it for all consumers outside the
    NPU subgraph. When several NPU subgraphs feed the same CPU output tensor,
    the existing CPU tensor is reused (via its src_tensor) instead of cloned.
    """
    if multiple_npu_sg_have_same_cpu_out_tens:
        new_tens = orig_tens
        orig_tens = orig_tens.src_tensor
    else:
        new_tens = orig_tens.clone("")
        orig_tens.name = orig_tens.name + "_cpu"
        new_tens.ops = []

    npu_subgraph.output_tensors.append(orig_tens)

    # The CPU-side twin is produced by the NpuOp call pass.
    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops.append(call_ps.primary_op)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in npu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    # NOTE(review): orig_tens was appended to npu_subgraph.output_tensors above,
    # so this condition always holds here — confirm whether that is intended.
    if orig_tens in npu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Repoint every consumer scheduled outside the NPU subgraph at the twin.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] != npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [new_tens if t == orig_tens else t for t in cpu_subgraph.output_tensors]
147
148
def extract_subgraph(nng, orig_sg, arch):
    """Pull runs of NPU passes out of a CPU subgraph into new NPU subgraphs.

    Each maximal run of NPU-placed passes in orig_sg becomes a new Subgraph,
    invoked from orig_sg via a CustomNpuOp call pass. Tensors crossing the
    CPU/NPU boundary are rewritten in both directions. Returns the list of
    newly created NPU subgraphs (empty if everything stays on the CPU).
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    place_vec[
        place_vec == PassPlacement.StartupInit
    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.

    # Forward, then backwards
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}
    startup_init_passes = {}

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Start of a new NPU run: create the subgraph, its call pass
                # (kept in the CPU graph) and its startup-init pass.
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Fixed: was `>= 0`, which is vacuously true. Every input tensor
                # must have at least one producer, otherwise source_sgs[0] below
                # would raise a less informative IndexError.
                assert len(source_sgs) > 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:

                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = False
                multiple_npu_sg_have_same_cpu_out_tens = False
                output_tensor = tens
                # Rewrite if any consumer lives outside this NPU subgraph.
                for sg in dest_sgs:
                    if sg != curr_sg:
                        need_rewrite = True
                        break
                # Also rewrite if the tensor is (or is equivalent to) a CPU graph
                # output; equivalence means several NPU subgraphs share one CPU
                # output tensor, which must then be reused rather than cloned.
                for orig_out_tens in orig_sg.output_tensors:
                    if tens not in curr_sg.output_tensors:
                        if tens == orig_out_tens:
                            need_rewrite = True
                        elif tens.equivalence_id == orig_out_tens.equivalence_id:
                            need_rewrite = True
                            multiple_npu_sg_have_same_cpu_out_tens = True
                            output_tensor = orig_out_tens

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        output_tensor,
                        call_pass[curr_sg],
                        curr_sg,
                        orig_sg,
                        subgraph_for_pass,
                        multiple_npu_sg_have_same_cpu_out_tens,
                    )

        for tens in curr_sg.output_tensors:
            # ofm can depend on multiple ops. These ops can be divided into different NPU
            # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
            tens.force_linear_format = True

    return new_subgraphs
273
274
def extract_npu_subgraphs(nng, arch):
    """Split every CPU subgraph in nng, extracting NPU passes into new subgraphs."""
    nng.refresh_after_modification()

    # Iterate over a snapshot: extraction appends new subgraphs to nng.subgraphs.
    for subgraph in list(nng.subgraphs):
        if subgraph.placement != PassPlacement.Cpu:
            continue
        nng.subgraphs.extend(extract_subgraph(nng, subgraph, arch))

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for subgraph in nng.subgraphs:
        subgraph.build_pass_links()