Blame - ethosu/vela/extract_npu_subgraphs.py - ml/ethos-u/ethos-u-vela

blob: bf637b83b3faa3cacf39e720e7f1ef9743de18ea [file] [log] [blame]

Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame^]	1	# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Rickard Bolin	bc6ee58	2022-11-04 08:24:29 +0000	[diff] [blame^]	16	#
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	17	# Description:
				18	# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
				19	# untouched in the final output.
				20	#
				21	# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
				22	# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
				23	# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	24	import numpy as np
				25
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	26	from .nn_graph import Pass
				27	from .nn_graph import PassPlacement
				28	from .nn_graph import Subgraph
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	29	from .operation import CustomType
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	30	from .operation import NpuBlockType
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	31	from .operation import Op
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	32	from .operation import Operation
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	33
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	34
				35	def make_npu_call_op_pass(npu_subgraph):
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	36	op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	37	op.attrs["subgraph"] = npu_subgraph
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	38	op.attrs["custom_type"] = CustomType.NpuOp
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	39	ps = Pass(op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
				40	ps.ops = [op]
				41	ps.primary_op = op
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	42	op.scheduled_pass = ps
				43
				44	# Inputs and outputs filled in later as we cut the graphs
				45	return ps
				46
				47
				48	def switch_tensor_for_op(op, orig_tens, new_tens):
				49
				50	op.inputs = [new_tens if tens == orig_tens else tens for tens in op.inputs]
				51	op.outputs = [new_tens if tens == orig_tens else tens for tens in op.outputs]
				52
				53	ps = op.scheduled_pass
				54	if ps is None:
				55	return
				56
				57	ps.inputs = [new_tens if tens == orig_tens else tens for tens in ps.inputs]
				58	ps.outputs = [new_tens if tens == orig_tens else tens for tens in ps.outputs]
				59
				60	if ps.ifm_tensor == orig_tens:
				61	ps.ifm_tensor = new_tens
				62	if ps.ifm2_tensor == orig_tens:
				63	ps.ifm2_tensor = new_tens
				64	if ps.ofm_tensor == orig_tens:
				65	ps.ofm_tensor = new_tens
				66	if ps.weight_tensor == orig_tens:
				67	ps.weight_tensor = new_tens
				68	if ps.scale_tensor == orig_tens:
				69	ps.scale_tensor = new_tens
				70
				71
				72	def rewrite_tensor_cpu_producer_npu_consumers(
				73	orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
				74	):
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	75	is_const = orig_tens.ops[0].type == Op.Const
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	76	new_tens = orig_tens.clone("_npu")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	77
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	78	op_type = Op.SubgraphInput
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	79	if is_const:
Louis Verhaard	aee5d75	2020-09-30 09:01:52 +0200	[diff] [blame]	80	op_type = Op.Const
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	81	op = Operation(op_type, orig_tens.name + "_input")
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	82	op.scheduled_pass = startup_init_ps
Michael McGeagh	c5b549b	2020-08-07 11:54:28 +0100	[diff] [blame]	83	op.set_output_tensor(new_tens)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	84	startup_init_ps.ops.append(op)
				85	startup_init_ps.outputs.append(new_tens)
				86
				87	if not is_const:
				88	call_ps.inputs.append(orig_tens)
				89	call_ps.primary_op.inputs.append(orig_tens)
				90
Johan Alfvén	8d57aaa	2022-02-04 11:19:17 +0100	[diff] [blame]	91	# Elementwise op can not overwrite ifm if input is used by many consumers
				92	if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
				93	new_tens.ifm_write_protected = True
				94
				95	# Elementwise op can not overwrite ifm if tensor is used as output from sub graph
				96	if orig_tens in cpu_subgraph.output_tensors:
				97	new_tens.ifm_write_protected = True
				98
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	99	for op in list(orig_tens.consumers()):
				100	if op is None:
				101	continue # Subgraph consumers handled separately.
				102	ps = op.scheduled_pass
				103	if subgraph_for_pass[ps] == npu_subgraph:
				104	switch_tensor_for_op(op, orig_tens, new_tens)
				105	orig_tens.consumer_list.remove(op)
				106	new_tens.consumer_list.append(op)
				107
				108	# Deal with output tensors for the NPU graph. These are special.
				109	npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]
				110
				111
				112	def rewrite_tensor_npu_producer_cpu_consumers(
				113	orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
				114	):
				115
James Ward	9338978	2021-10-14 12:58:02 +0100	[diff] [blame]	116	new_tens = orig_tens.clone("")
				117	orig_tens.name = orig_tens.name + "_cpu"
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	118	npu_subgraph.output_tensors.append(orig_tens)
				119
				120	call_ps.outputs.append(new_tens)
				121	call_ps.primary_op.outputs.append(new_tens)
				122	new_tens.ops = [call_ps.primary_op]
				123
Johan Alfvén	1b9218e	2022-02-08 13:01:09 +0100	[diff] [blame]	124	# Elementwise op can not overwrite ifm if input is used by many consumers
				125	if orig_tens in npu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
				126	new_tens.ifm_write_protected = True
				127
				128	# Elementwise op can not overwrite ifm if tensor is used as output from sub graph
				129	if orig_tens in npu_subgraph.output_tensors:
				130	new_tens.ifm_write_protected = True
				131
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	132	for op in list(orig_tens.consumers()):
				133	if op is None:
				134	continue # Subgraph consumers handled separately.
				135	ps = op.scheduled_pass
				136	if subgraph_for_pass[ps] != npu_subgraph:
				137	switch_tensor_for_op(op, orig_tens, new_tens)
				138	orig_tens.consumer_list.remove(op)
				139	new_tens.consumer_list.append(op)
				140
				141	# Deal with output tensors for the CPU graph. These are special.
				142	cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]
				143
				144
				145	def extract_subgraph(nng, orig_sg, arch):
				146	assert orig_sg.placement == PassPlacement.Cpu
				147
				148	passes = list(orig_sg.passes)
				149	place_vec = np.array([ps.placement for ps in passes])
				150	place_vec[
				151	place_vec == PassPlacement.StartupInit
				152	] = PassPlacement.Cpu # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.
				153
				154	# MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
Fredrik Svedberg	2b5939f	2021-10-14 15:16:30 +0200	[diff] [blame]	155	# passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	156
				157	# Forward, then backwards
				158	for is_reversed in range(2):
				159	last_place = PassPlacement.Cpu
				160	seq = enumerate(place_vec)
				161	if is_reversed:
				162	seq = reversed(list(seq))
				163	for idx, place in seq:
Fredrik Svedberg	2b5939f	2021-10-14 15:16:30 +0200	[diff] [blame]	164	if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	165	if last_place == PassPlacement.Npu:
				166	place = PassPlacement.Npu
				167	place_vec[idx] = place
				168
				169	if place != PassPlacement.MemoryOnly:
				170	last_place = place
				171
				172	# Anything left, assign to the CPU.
				173	place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu
				174
				175	if np.all(place_vec == PassPlacement.Cpu):
				176	return [] # Nothing to do
				177
				178	# Create the subgraphs and split passes between them
				179
				180	new_subgraphs = []
				181	split_count = 0
				182	subgraph_for_pass = {}
				183	orig_sg.passes = []
				184	call_pass = {}
				185	startup_init_passes = {}
				186
				187	last_place = PassPlacement.Cpu
				188	curr_sg = orig_sg
				189
				190	for idx, place in enumerate(place_vec):
				191	if place != last_place:
				192	if place == PassPlacement.Npu:
				193	split_count += 1
				194	curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
				195	new_subgraphs.append(curr_sg)
				196	call_ps = make_npu_call_op_pass(curr_sg)
				197	subgraph_for_pass[call_ps] = orig_sg
				198	orig_sg.passes.append(call_ps)
				199	call_pass[curr_sg] = call_ps
				200
				201	startup_init_ps = Pass(
				202	curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
				203	)
				204	curr_sg.passes.append(startup_init_ps)
				205	startup_init_passes[curr_sg] = startup_init_ps
				206	subgraph_for_pass[startup_init_ps] = curr_sg
				207
				208	else:
				209	curr_sg = orig_sg
				210	last_place = place
				211	ps = passes[idx]
				212	subgraph_for_pass[ps] = curr_sg
				213	curr_sg.passes.append(ps)
				214
				215	# Rewrite tensors to fix up graphs.
				216
				217	for curr_sg in new_subgraphs:
				218	for ps in curr_sg.passes:
				219	for tens in ps.inputs:
				220	source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
				221	assert len(source_sgs) >= 0
				222	producer_sg = source_sgs[0]
				223	for sg in source_sgs:
				224	assert sg == producer_sg # All need to be the same.
				225
				226	if producer_sg != curr_sg:
				227	assert (
				228	producer_sg == orig_sg
				229	) # Because we go in-order, all the producers must be the original graph.
				230	rewrite_tensor_cpu_producer_npu_consumers(
				231	tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
				232	)
				233
				234	for tens in ps.outputs:
				235
				236	dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
				237	need_rewrite = False
				238	for sg in dest_sgs:
				239	if sg != curr_sg:
				240	need_rewrite = True
				241	break
				242	if tens in orig_sg.output_tensors:
				243	need_rewrite = True
				244
				245	if need_rewrite:
				246	rewrite_tensor_npu_producer_cpu_consumers(
				247	tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
				248	)
				249
Johan Alfvén	211165a	2022-02-06 15:30:07 +0100	[diff] [blame]	250	for tens in curr_sg.output_tensors:
				251	# ofm can depend on multiple ops. These ops can be divided into different NPU
				252	# nodes due to CPU nodes. If that is the case the ofm must be NHWC.
				253	tens.needs_linear_format = True
				254
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	255	return new_subgraphs
				256
				257
				258	def extract_npu_subgraphs(nng, arch):
				259
				260	nng.refresh_after_modification()
				261
				262	for sg in list(nng.subgraphs):
				263	if sg.placement == PassPlacement.Cpu:
				264	new_subgraphs = extract_subgraph(nng, sg, arch)
				265	nng.subgraphs += new_subgraphs
				266
				267	nng.refresh_after_modification()
				268	nng.prune_startup_init_pass()
				269
				270	for sg in nng.subgraphs:
				271	sg.build_pass_links()