blob: c4f66b8f75b7abcb31b0e609fdd6c70c0f42a7a8 [file] [log] [blame]
Raul Farkas72c6a242023-03-16 16:38:05 +00001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Tim Hall79d07d22020-04-27 18:20:16 +010017# Description:
18# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
19# untouched in the final output.
20#
21# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
22# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
23# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
Tim Hall79d07d22020-04-27 18:20:16 +010024import numpy as np
25
Diego Russoe8a10452020-04-21 17:39:10 +010026from .nn_graph import Pass
27from .nn_graph import PassPlacement
28from .nn_graph import Subgraph
Louis Verhaardaee5d752020-09-30 09:01:52 +020029from .operation import CustomType
Diego Russoe8a10452020-04-21 17:39:10 +010030from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020031from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010032from .operation import Operation
Diego Russoea6111a2020-04-14 18:41:58 +010033
Tim Hall79d07d22020-04-27 18:20:16 +010034
def make_npu_call_op_pass(npu_subgraph):
    """Build the pass that invokes an NPU subgraph from the CPU graph.

    Wraps a CustomNpuOp (custom_type NpuOp) pointing at npu_subgraph inside a
    MemoryOnly pass and returns that pass.
    """
    call_op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
    call_op.attrs["subgraph"] = npu_subgraph
    call_op.attrs["custom_type"] = CustomType.NpuOp

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op
    call_op.scheduled_pass = call_ps

    # Inputs and outputs are attached later, as the graphs are cut apart.
    return call_ps
46
47
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Replace every reference to orig_tens on op with new_tens.

    Updates the op's own input/output lists and, if the op has been scheduled,
    the corresponding lists and named tensor slots of its scheduled pass.
    """

    def swapped(tensors):
        # Substitute new_tens for each occurrence of orig_tens (by equality).
        return [new_tens if t == orig_tens else t for t in tensors]

    op.inputs = swapped(op.inputs)
    op.outputs = swapped(op.outputs)

    ps = op.scheduled_pass
    if ps is None:
        return

    ps.inputs = swapped(ps.inputs)
    ps.outputs = swapped(ps.outputs)

    # The pass also caches specific tensor roles; keep them in sync.
    for slot in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(ps, slot) == orig_tens:
            setattr(ps, slot, new_tens)
70
71
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the CPU but consumed inside an NPU subgraph.

    A clone ("_npu") of the tensor is created inside the NPU subgraph, fed by a
    SubgraphInput op (or a Const op when the producer is a constant) placed in
    the startup-init pass. All NPU-side consumers are repointed at the clone.
    """
    is_const = orig_tens.ops[0].type == Op.Const
    new_tens = orig_tens.clone("_npu")

    # Constants are materialised directly inside the NPU subgraph; anything
    # else enters through a SubgraphInput placeholder.
    input_op = Operation(Op.Const if is_const else Op.SubgraphInput, orig_tens.name + "_input")
    input_op.scheduled_pass = startup_init_ps
    input_op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(input_op)
    startup_init_ps.outputs.append(new_tens)

    # Non-constant data must actually be passed in through the NpuOp call.
    if not is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    if orig_tens in cpu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Repoint every consumer scheduled into the NPU subgraph at the clone.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] == npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [new_tens if t == orig_tens else t for t in npu_subgraph.output_tensors]
110
111
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass, multiple_npu_sg_have_same_cpu_out_tens
):
    """Rewire a tensor produced inside an NPU subgraph but consumed on the CPU.

    The NPU-side tensor becomes an output of the NPU subgraph, and a CPU-side
    twin (produced by the NpuOp call) replaces it for all consumers outside the
    NPU subgraph. When several NPU subgraphs feed the same CPU output tensor,
    the existing CPU tensor is reused (via its src_tensor) instead of cloned.
    """
    if multiple_npu_sg_have_same_cpu_out_tens:
        new_tens = orig_tens
        orig_tens = orig_tens.src_tensor
    else:
        new_tens = orig_tens.clone("")
        orig_tens.name = orig_tens.name + "_cpu"
        new_tens.ops = []

    npu_subgraph.output_tensors.append(orig_tens)

    # The CPU-side twin is produced by the NpuOp call pass.
    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops.append(call_ps.primary_op)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in npu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    # NOTE(review): orig_tens was appended to npu_subgraph.output_tensors above,
    # so this condition always holds here — confirm whether that is intended.
    if orig_tens in npu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Repoint every consumer scheduled outside the NPU subgraph at the twin.
    for consumer in list(orig_tens.consumers()):
        if consumer is None:
            continue  # Subgraph consumers handled separately.
        if subgraph_for_pass[consumer.scheduled_pass] != npu_subgraph:
            switch_tensor_for_op(consumer, orig_tens, new_tens)
            orig_tens.consumer_list.remove(consumer)
            new_tens.consumer_list.append(consumer)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [new_tens if t == orig_tens else t for t in cpu_subgraph.output_tensors]
147
148
def extract_subgraph(nng, orig_sg, arch):
    """Pull runs of NPU passes out of a CPU subgraph into new NPU subgraphs.

    Each maximal run of NPU-placed passes in orig_sg becomes a new Subgraph,
    invoked from orig_sg via a CustomNpuOp call pass. Tensors crossing the
    CPU/NPU boundary are rewritten in both directions. Returns the list of
    newly created NPU subgraphs (empty if everything stays on the CPU).
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    place_vec[
        place_vec == PassPlacement.StartupInit
    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.

    # Forward, then backwards
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}
    startup_init_passes = {}

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Start of a new NPU run: create the subgraph, its call pass
                # (kept in the CPU graph) and its startup-init pass.
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Fixed: was `>= 0`, which is vacuously true. Every input tensor
                # must have at least one producer, otherwise source_sgs[0] below
                # would raise a less informative IndexError.
                assert len(source_sgs) > 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:

                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = False
                multiple_npu_sg_have_same_cpu_out_tens = False
                output_tensor = tens
                # Rewrite if any consumer lives outside this NPU subgraph.
                for sg in dest_sgs:
                    if sg != curr_sg:
                        need_rewrite = True
                        break
                # Also rewrite if the tensor is (or is equivalent to) a CPU graph
                # output; equivalence means several NPU subgraphs share one CPU
                # output tensor, which must then be reused rather than cloned.
                for orig_out_tens in orig_sg.output_tensors:
                    if tens not in curr_sg.output_tensors:
                        if tens == orig_out_tens:
                            need_rewrite = True
                        elif tens.equivalence_id == orig_out_tens.equivalence_id:
                            need_rewrite = True
                            multiple_npu_sg_have_same_cpu_out_tens = True
                            output_tensor = orig_out_tens

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        output_tensor,
                        call_pass[curr_sg],
                        curr_sg,
                        orig_sg,
                        subgraph_for_pass,
                        multiple_npu_sg_have_same_cpu_out_tens,
                    )

        for tens in curr_sg.output_tensors:
            # ofm can depend on multiple ops. These ops can be divided into different NPU
            # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
            tens.force_linear_format = True

    return new_subgraphs
273
274
def extract_npu_subgraphs(nng, arch):
    """Split every CPU subgraph in nng, extracting NPU passes into new subgraphs."""
    nng.refresh_after_modification()

    # Iterate over a snapshot: extraction appends new subgraphs to nng.subgraphs.
    for subgraph in list(nng.subgraphs):
        if subgraph.placement != PassPlacement.Cpu:
            continue
        nng.subgraphs.extend(extract_subgraph(nng, subgraph, arch))

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for subgraph in nng.subgraphs:
        subgraph.build_pass_links()