# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Vela separates CPU operations and NPU operations into separate internal subgraphs. The CPU operations are left
# untouched in the final output.
#
# Vela does this by identifying NPU passes and pulling them out from the main CPU graph into separate subgraphs, invoked
# by NpuOp operations. Later, Vela generates command streams and compressed weight streams for the NPU subgraphs and
# attaches them to the NpuOp. This encapsulates everything the NPU subgraph is supposed to do.
Tim Hall79d07d22020-04-27 18:20:16 +010023import numpy as np
24
Diego Russoe8a10452020-04-21 17:39:10 +010025from .nn_graph import Pass
26from .nn_graph import PassPlacement
27from .nn_graph import Subgraph
Louis Verhaardaee5d752020-09-30 09:01:52 +020028from .operation import CustomType
Diego Russoe8a10452020-04-21 17:39:10 +010029from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020030from .operation import Op
Diego Russoe8a10452020-04-21 17:39:10 +010031from .operation import Operation
Diego Russoea6111a2020-04-14 18:41:58 +010032
Tim Hall79d07d22020-04-27 18:20:16 +010033
def make_npu_call_op_pass(npu_subgraph):
    """Build a MemoryOnly pass holding the CustomNpuOp that invokes npu_subgraph.

    The returned pass lives in the CPU graph and acts as the call site for the
    extracted NPU subgraph.
    """
    call_op = Operation(Op.CustomNpuOp, "call_" + npu_subgraph.name)
    call_op.attrs["subgraph"] = npu_subgraph
    call_op.attrs["custom_type"] = CustomType.NpuOp

    call_ps = Pass(call_op.name, PassPlacement.MemoryOnly, False, NpuBlockType.Default)
    call_ps.ops = [call_op]
    call_ps.primary_op = call_op
    call_op.scheduled_pass = call_ps

    # Inputs and outputs are attached later, while the graphs are being cut apart.
    return call_ps
45
46
def switch_tensor_for_op(op, orig_tens, new_tens):
    """Substitute new_tens for orig_tens everywhere on op and its scheduled pass."""

    def substituted(tensors):
        # Replace equality matches while preserving order and length.
        return [new_tens if tens == orig_tens else tens for tens in tensors]

    op.inputs = substituted(op.inputs)
    op.outputs = substituted(op.outputs)

    ps = op.scheduled_pass
    if ps is None:
        # Op has not been assigned to a pass yet; nothing more to update.
        return

    ps.inputs = substituted(ps.inputs)
    ps.outputs = substituted(ps.outputs)

    # The pass also caches specific tensor roles; keep them in sync.
    for role in ("ifm_tensor", "ifm2_tensor", "ofm_tensor", "weight_tensor", "scale_tensor"):
        if getattr(ps, role) == orig_tens:
            setattr(ps, role, new_tens)
70
def rewrite_tensor_cpu_producer_npu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the CPU but consumed by NPU passes.

    A clone of the tensor (suffix "_npu") becomes the NPU-side copy, produced
    inside npu_subgraph by a SubgraphInput op (or a Const op when the original
    producer is constant). All NPU-side consumers are switched to the clone;
    non-constant tensors are also added as inputs of the CustomNpuOp call pass
    so the CPU graph passes them in at the call site.
    """
    # NOTE(review): only the first producer op is inspected here; callers are
    # expected to guarantee a single producer (extract_subgraph asserts that
    # all producers live in the same subgraph).
    is_const = orig_tens.ops[0].type == Op.Const
    new_tens = orig_tens.clone("_npu")

    # Constants are re-materialized inside the NPU graph; anything else enters
    # through a SubgraphInput placeholder.
    op_type = Op.SubgraphInput
    if is_const:
        op_type = Op.Const
    op = Operation(op_type, orig_tens.name + "_input")
    op.scheduled_pass = startup_init_ps
    op.set_output_tensor(new_tens)
    startup_init_ps.ops.append(op)
    startup_init_ps.outputs.append(new_tens)

    # Non-constant tensors must be passed into the NPU subgraph at the call
    # site; constants travel with the subgraph itself.
    if not is_const:
        call_ps.inputs.append(orig_tens)
        call_ps.primary_op.inputs.append(orig_tens)

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in cpu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    if orig_tens in cpu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Iterate over a snapshot: switch_tensor_for_op mutates consumer_list.
    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] == npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the NPU graph. These are special.
    npu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in npu_subgraph.output_tensors]
110
def rewrite_tensor_npu_producer_cpu_consumers(
    orig_tens, call_ps, startup_init_ps, npu_subgraph, cpu_subgraph, subgraph_for_pass
):
    """Rewire a tensor produced on the NPU but consumed by CPU passes.

    The original tensor stays inside npu_subgraph (renamed with a "_cpu"
    suffix) and becomes an output of that subgraph. A clone keeps the original
    name on the CPU side, produced by the CustomNpuOp call pass, and every
    consumer outside npu_subgraph is switched over to the clone.
    """

    # Clone keeps the original (externally visible) name; the NPU-internal
    # tensor is the one that gets renamed.
    new_tens = orig_tens.clone("")
    orig_tens.name = orig_tens.name + "_cpu"
    npu_subgraph.output_tensors.append(orig_tens)

    # The CPU-side clone is produced by the call op at the call site.
    call_ps.outputs.append(new_tens)
    call_ps.primary_op.outputs.append(new_tens)
    new_tens.ops = [call_ps.primary_op]

    # Elementwise op can not overwrite ifm if input is used by many consumers
    if orig_tens in npu_subgraph.input_tensors and len(orig_tens.consumers()) > 1:
        new_tens.ifm_write_protected = True

    # Elementwise op can not overwrite ifm if tensor is used as output from sub graph
    # NOTE(review): orig_tens was appended to npu_subgraph.output_tensors just
    # above, so this condition is always true here — confirm whether the check
    # was meant to run against the pre-append list.
    if orig_tens in npu_subgraph.output_tensors:
        new_tens.ifm_write_protected = True

    # Iterate over a snapshot: switch_tensor_for_op mutates consumer_list.
    for op in list(orig_tens.consumers()):
        if op is None:
            continue  # Subgraph consumers handled separately.
        ps = op.scheduled_pass
        if subgraph_for_pass[ps] != npu_subgraph:
            switch_tensor_for_op(op, orig_tens, new_tens)
            orig_tens.consumer_list.remove(op)
            new_tens.consumer_list.append(op)

    # Deal with output tensors for the CPU graph. These are special.
    cpu_subgraph.output_tensors = [new_tens if tens == orig_tens else tens for tens in cpu_subgraph.output_tensors]
143
def extract_subgraph(nng, orig_sg, arch):
    """Split the NPU-capable passes of CPU subgraph orig_sg into new NPU subgraphs.

    Phase 1 decides a placement for every pass (MemoryOnly passes are absorbed
    into adjacent NPU regions when allowed to run on the NPU). Phase 2 moves
    each contiguous run of NPU passes into a fresh Subgraph, invoked from
    orig_sg through a CustomNpuOp call pass. Phase 3 rewrites every tensor
    that crosses the CPU/NPU boundary.

    Returns the list of newly created NPU subgraphs (empty if all passes stay
    on the CPU). orig_sg is modified in place.
    """
    assert orig_sg.placement == PassPlacement.Cpu

    passes = list(orig_sg.passes)
    place_vec = np.array([ps.placement for ps in passes])
    place_vec[
        place_vec == PassPlacement.StartupInit
    ] = PassPlacement.Cpu  # Keep the startup init pass on the CPU, we'll make new ones to move onto NPU.

    # MemoryOnly passes that are either squeezed between NPU passes or on the boundary of NPU and CPU
    # passes should be assigned to the NPU, unless they are assigned to run on CPU explicitly.

    # Forward, then backwards, so MemoryOnly passes adjacent to an NPU pass on
    # either side are pulled onto the NPU.
    for is_reversed in range(2):
        last_place = PassPlacement.Cpu
        seq = enumerate(place_vec)
        if is_reversed:
            seq = reversed(list(seq))
        for idx, place in seq:
            if place == PassPlacement.MemoryOnly and passes[idx].ops[0].run_on_npu:
                if last_place == PassPlacement.Npu:
                    place = PassPlacement.Npu
                    place_vec[idx] = place

            # MemoryOnly is transparent: it never updates last_place.
            if place != PassPlacement.MemoryOnly:
                last_place = place

    # Anything left, assign to the CPU.
    place_vec[place_vec == PassPlacement.MemoryOnly] = PassPlacement.Cpu

    if np.all(place_vec == PassPlacement.Cpu):
        return []  # Nothing to do

    # Create the subgraphs and split passes between them.

    new_subgraphs = []
    split_count = 0
    subgraph_for_pass = {}
    orig_sg.passes = []
    call_pass = {}  # NPU subgraph -> the call pass that invokes it
    startup_init_passes = {}  # NPU subgraph -> its startup init pass

    last_place = PassPlacement.Cpu
    curr_sg = orig_sg

    for idx, place in enumerate(place_vec):
        if place != last_place:
            if place == PassPlacement.Npu:
                # Entering an NPU region: open a new subgraph plus its call
                # pass (in the CPU graph) and startup init pass (in the NPU graph).
                split_count += 1
                curr_sg = Subgraph("%s_split_%d" % (orig_sg.name, split_count), PassPlacement.Npu)
                new_subgraphs.append(curr_sg)
                call_ps = make_npu_call_op_pass(curr_sg)
                subgraph_for_pass[call_ps] = orig_sg
                orig_sg.passes.append(call_ps)
                call_pass[curr_sg] = call_ps

                startup_init_ps = Pass(
                    curr_sg.name + "_startup_init", PassPlacement.StartupInit, False, NpuBlockType.Default
                )
                curr_sg.passes.append(startup_init_ps)
                startup_init_passes[curr_sg] = startup_init_ps
                subgraph_for_pass[startup_init_ps] = curr_sg

            else:
                # Leaving an NPU region: subsequent passes go back on the CPU.
                curr_sg = orig_sg
            last_place = place
        ps = passes[idx]
        subgraph_for_pass[ps] = curr_sg
        curr_sg.passes.append(ps)

    # Rewrite tensors to fix up graphs.

    for curr_sg in new_subgraphs:
        for ps in curr_sg.passes:
            for tens in ps.inputs:
                source_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.ops]
                # Fix: was `>= 0`, which is vacuously true. The next line
                # indexes source_sgs[0], so there must be at least one producer.
                assert len(source_sgs) > 0
                producer_sg = source_sgs[0]
                for sg in source_sgs:
                    assert sg == producer_sg  # All need to be the same.

                if producer_sg != curr_sg:
                    assert (
                        producer_sg == orig_sg
                    )  # Because we go in-order, all the producers must be the original graph.
                    rewrite_tensor_cpu_producer_npu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

            for tens in ps.outputs:

                dest_sgs = [subgraph_for_pass[op.scheduled_pass] for op in tens.consumers() if op is not None]
                need_rewrite = False
                for sg in dest_sgs:
                    if sg != curr_sg:
                        need_rewrite = True
                        break
                if tens in orig_sg.output_tensors:
                    # Subgraph outputs must be visible to the CPU graph even
                    # with no CPU consumers.
                    need_rewrite = True

                if need_rewrite:
                    rewrite_tensor_npu_producer_cpu_consumers(
                        tens, call_pass[curr_sg], startup_init_passes[curr_sg], curr_sg, orig_sg, subgraph_for_pass
                    )

        for tens in curr_sg.output_tensors:
            # ofm can depend on multiple ops. These ops can be divided into different NPU
            # nodes due to CPU nodes. If that is the case the ofm must be NHWC.
            tens.needs_linear_format = True

    return new_subgraphs
255
256
def extract_npu_subgraphs(nng, arch):
    """Pull NPU-capable passes out of every CPU subgraph of nng into NPU subgraphs."""

    nng.refresh_after_modification()

    # Snapshot the list: extraction appends new NPU subgraphs to nng.subgraphs.
    for subgraph in list(nng.subgraphs):
        if subgraph.placement != PassPlacement.Cpu:
            continue
        nng.subgraphs += extract_subgraph(nng, subgraph, arch)

    nng.refresh_after_modification()
    nng.prune_startup_init_pass()

    for subgraph in nng.subgraphs:
        subgraph.build_pass_links()