# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
# untouched in the final output.
#
# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
Tim Hall79d07d22020-04-27 18:20:16 +010023import numpy as np
24
Diego Russoe8a10452020-04-21 17:39:10 +010025from .nn_graph import Pass
26from .nn_graph import PassPlacement
27from .nn_graph import Subgraph
Louis Verhaardaee5d752020-09-30 09:01:52 +020028from .operation import CustomType
Diego Russoe8a10452020-04-21 17:39:10 +010029from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020030from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010031from .operation import Operation
Diego Russoea6111a2020-04-14 18:41:58 +010032
Tim Hall79d07d22020-04-27 18:20:16 +010033
def make_npu_call_op_pass(npu_subgraph):
    """Build a MemoryOnly pass holding the CustomNpuOp that invokes npu_subgraph.

    The returned pass lives in the CPU graph and acts as the call site for the
    extracted NPU subgraph.
    """
    call_op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
    call_op.attrs["subgraph"] = npu_subgraph
    call_op.attrs["custom_type"] = CustomType.NpuOp

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op
    call_op.scheduled_pass = call_ps

    # Inputs and outputs are attached later, while the graphs are being cut apart.
    return call_ps
45
46
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Substitute new_tens for orig_tens everywhere on op and its scheduled pass."""

    def substituted(tensors):
        # Replace equality matches while preserving order and length.
        return [new_tens if tens == orig_tens else tens for tens in tensors]

    op.inputs = substituted(op.inputs)
    op.outputs = substituted(op.outputs)

    ps = op.scheduled_pass
    if ps is None:
        # Op has not been assigned to a pass yet; nothing more to update.
        return

    ps.inputs = substituted(ps.inputs)
    ps.outputs = substituted(ps.outputs)

    # The pass also caches specific tensor roles; keep them in sync.
    for role in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(ps, role) == orig_tens:
            setattr(ps, role, new_tens)
70
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the CPU but consumed by NPU passes.

    A clone of the tensor (suffix "_npu") becomes the NPU-side copy, produced
    inside npu_subgraph by a SubgraphInput op (or a Const op when the original
    producer is constant). All NPU-side consumers are switched to the clone;
    non-constant tensors are also added as inputs of the CustomNpuOp call pass
    so the CPU graph passes them in at the call site.
    """
    # NOTE(review): only the first producer op is inspected here; callers are
    # expected to guarantee a single producer (extract_subgraph asserts that
    # all producers live in the same subgraph).
    is_const = orig_tens.ops[0].type == Op.Const
    new_tens = orig_tens.clone("_npu")

    # Constants are re-materialized inside the NPU graph; anything else enters
    # through a SubgraphInput placeholder.
    op_type = Op.SubgraphInput
    if is_const:
        op_type = Op.Const
    op = Operation(op_type, orig_tens.name + "_input")
    op.scheduled_pass = startup_init_ps
    op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(op)
    startup_init_ps.outputs.append(new_tens)

    # Non-constant tensors must be passed into the NPU subgraph at the call
    # site; constants travel with the subgraph itself.
    if not is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    if orig_tens in cpu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Iterate over a snapshot: switch_tensor_for_op mutates consumer_list.
    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] == npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]
110
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the NPU but consumed by CPU passes.

    The original tensor stays inside npu_subgraph (renamed with a "_cpu"
    suffix) and becomes an output of that subgraph. A clone keeps the original
    name on the CPU side, produced by the CustomNpuOp call pass, and every
    consumer outside npu_subgraph is switched over to the clone.
    """

    # Clone keeps the original (externally visible) name; the NPU-internal
    # tensor is the one that gets renamed.
    new_tens = orig_tens.clone("")
    orig_tens.name = orig_tens.name + "_cpu"
    npu_subgraph.output_tensors.append(orig_tens)

    # The CPU-side clone is produced by the call op at the call site.
    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops = [call_ps.primary_op]

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in npu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    # NOTE(review): orig_tens was appended to npu_subgraph.output_tensors just
    # above, so this condition is always true here — confirm whether the check
    # was meant to run against the pre-append list.
    if orig_tens in npu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Iterate over a snapshot: switch_tensor_for_op mutates consumer_list.
    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] != npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]
143
def extract_subgraph(nng, orig_sg, arch):
    """Split the NPU-capable passes of CPU subgraph orig_sg into new NPU subgraphs.

    Phase 1 decides a placement for every pass (MemoryOnly passes are absorbed
    into adjacent NPU regions when allowed to run on the NPU). Phase 2 moves
    each contiguous run of NPU passes into a fresh Subgraph, invoked from
    orig_sg through a CustomNpuOp call pass. Phase 3 rewrites every tensor
    that crosses the CPU/NPU boundary.

    Returns the list of newly created NPU subgraphs (empty if all passes stay
    on the CPU). orig_sg is modified in place.
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    place_vec[
        place_vec == PassPlacement.StartupInit
    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.

    # Forward, then backwards, so MemoryOnly passes adjacent to an NPU pass on
    # either side are pulled onto the NPU.
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            # MemoryOnly is transparent: it never updates last_place.
            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them.

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}  # NPU subgraph -> the call pass that invokes it
    startup_init_passes = {}  # NPU subgraph -> its startup init pass

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Entering an NPU region: open a new subgraph plus its call
                # pass (in the CPU graph) and startup init pass (in the NPU graph).
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                # Leaving an NPU region: subsequent passes go back on the CPU.
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Fix: was `>= 0`, which is vacuously true. The next line
                # indexes source_sgs[0], so there must be at least one producer.
                assert len(source_sgs) > 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:

                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = False
                for sg in dest_sgs:
                    if sg != curr_sg:
                        need_rewrite = True
                        break
                if tens in orig_sg.output_tensors:
                    # Subgraph outputs must be visible to the CPU graph even
                    # with no CPU consumers.
                    need_rewrite = True

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

        for tens in curr_sg.output_tensors:
            # ofm can depend on multiple ops. These ops can be divided into different NPU
            # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
            tens.needs_linear_format = True

    return new_subgraphs
255
256
def extract_npu_subgraphs(nng, arch):
    """Pull NPU-capable passes out of every CPU subgraph of nng into NPU subgraphs."""

    nng.refresh_after_modification()

    # Snapshot the list: extraction appends new NPU subgraphs to nng.subgraphs.
    for subgraph in list(nng.subgraphs):
        if subgraph.placement != PassPlacement.Cpu:
            continue
        nng.subgraphs += extract_subgraph(nng, subgraph, arch)

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for subgraph in nng.subgraphs:
        subgraph.build_pass_links()