# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
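# Packing proceeds backwards from each subgraph's output tensors, greedily
# fusing producer operations into the current Pass for as long as the rule
# table (test_sequence, below) allows it.
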
import collections
import enum

from .nn_graph import Pass, PassPlacement
from .operation import Operation, NpuBlockType
from .tensor import TensorPurpose


class PassFlags(enum.Flag):
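    """Role flags accumulated while packing operations into a pass.

    The values are distinct bits so that combinations can be stored in a
    single enum.Flag value and tested with bitwise operators; some bit
    positions are left unused.
    """
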
    Empty = 0
    Pre = 1
    Main = 2
    Post = 4
    Mac = 8
    Dma = 32
    ElementWise = 256
    Npu = 512
    Cpu = 1024
    StartupInit = 2048
    MemoryOnly = 4096
    PostFusingLimited = 8192


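# Operator sets driving the packing rules below. The strings are operator
# type names as they appear in the internal graph representation.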
npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))

mac_main_ops = set(
    (
        # convolutions
        "Conv2DBiasAct",
        "Conv2D",
        "QuantizedConv2D",
        "Conv2DBackpropInputSwitched",
        # depth-wise convolutions
        "DepthwiseConv2dBiasAct",
        "DepthwiseConv2dNative",
        "QuantizedDepthwiseConv2D",
        # FC layers
        "QuantizedMatMul",
        "MatMul",
        "FullyConnectedAct",
        # RNN/LSTM/GRU
        "BlockLSTM",
        # pooling
        "QuantizedMaxPool",
        "QuantizedAvgPool",
        "AvgPool",
        "MaxPool",
        "AvgPoolAct",
        "MaxPoolAct",
        # resizing/upscaling
        "ResizeBilinear",
    )
)

binary_elem_wise_main_ops = set(
    (
        # binary element-wise
        "AddAct",
        "MulAct",
        "SubAct",
        "QuantizedAdd",
        "QuantizedSub",
        "QuantizedMul",
        "Mul",
        "Add",
        "Sub",
        "Minimum",
        "Maximum",
    )
)

unary_elem_wise_main_ops = set(("LeakyRelu", "Abs"))  # Unary element-wise operations

elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops

activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
npu_post_ops = activation_ops | set(
    # Bias-add operations: get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
    ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
)

npu_post_fuse_limited_ops = set(
    # Set of post operators that should not be fused with main/element-wise ops
    ("ConcatSliceWrite", "Sigmoid", "Tanh")
)

elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))


quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops

npu_dma_ops = set(("DMA",))
startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))


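# Each entry of test_sequence is a tuple of
# (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear).
# During packing, entries are tried in order: an operation is packed by the
# first entry whose ops_set contains its type (None matches any type) and
# whose incompatible_pack_flags do not overlap the flags of the pass built so
# far; the matched entry's flags are then cleared/set on the pass.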
test_sequence = [
    (
        # ops_set
        npu_post_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.Post,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_post_fuse_limited_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.PostFusingLimited,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        mac_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.ElementWise
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        elem_wise_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.Mac
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_pre_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_dma_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Dma,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        startup_init_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.StartupInit | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        memory_only_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu,
        # flags_to_set
        PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        cpu_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (  # This last one is a fallback for unrecognised operations
        # ops_set
        None,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
]

# Some sanity checking
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
    assert not flags_to_clear & flags_to_set

    if operation_set is not None:
        for op in operation_set:
            assert len(op) > 1  # Catch a bare string used in place of a tuple, which would decompose into characters


def pack_into_passes(nng, arch, verbose_packing=False):
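    """Pack each subgraph in nng into Passes of one or more Operations.

    Traversal starts from the subgraph output tensors and works backwards,
    building passes according to the rules in test_sequence. Fills in
    sg.passes for every subgraph and returns nng.
    """
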
    def visit_op(op, ignored):
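        """Visit op once per consumed output tensor; build a pass for it (or
        queue it for the startup pass) once all outputs have been visited."""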
        visit_op_refcount[op] += 1

        if visit_op_refcount[op] == 1:  # First-time visit, go and fix up unused output tensors
            for tens in op.outputs:
                if len(tens.consumers()) == 0:
                    visit_op_refcount[op] += 1

        assert visit_op_refcount[op] <= len(op.outputs)
        if visit_op_refcount[op] == len(op.outputs):

            if op.type in startup_init_ops:
                startup_list.append(op)
            else:
                _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
                if ofm_tensor is None:
                    ofm_tensor = op.outputs[0]
                build_pass((op,), ofm_tensor)

    def build_pass(start_ops_to_process, ofm_tensor=None):
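        """Assemble a single Pass by walking backwards from
        start_ops_to_process, packing producer ops into the pass while the
        rules in test_sequence allow it. Returns the new Pass.
        """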
        reverse_ops_list = []
        curr_flags = PassFlags.Empty
        npu_block_type = NpuBlockType.Default

        reverse_intermediates = []
        input_set = set()
        ifm_tensor = None
        primary_op = None

        to_process = collections.deque()
        for start_op in start_ops_to_process:
            to_process.append((start_op, None))

        while to_process:
            curr_op, tens = to_process.popleft()

            if curr_op in reverse_ops_list:
                continue

            for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
                if operation_set is None or curr_op.type in operation_set:
                    if not (curr_flags & incompatible_pack_flags):
                        if flags_to_set & PassFlags.Npu:
                            if not curr_op.run_on_npu:
                                continue

                        reverse_ops_list.append(curr_op)
                        new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
                        if new_block_type != NpuBlockType.Default:
                            assert npu_block_type == NpuBlockType.Default
                            npu_block_type = new_block_type  # Only one major block type per pass
                            assert primary_op is None
                            primary_op = curr_op

                        curr_flags &= ~flags_to_clear
                        curr_flags |= flags_to_set

                        if flags_to_set & PassFlags.Npu:
                            if flags_to_set & (
                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
                            ):
                                assert len(curr_op.inputs) >= 1
                                if curr_op.type == "BlockLSTM":
                                    ifm_tensor = curr_op.inputs[3]
                                else:
                                    ifm_tensor = curr_op.inputs[0]
                                assert ifm_tensor.purpose == TensorPurpose.FeatureMap

                        if flags_to_set & PassFlags.Dma:
                            # DMAs are special - output buffers need to be preserved as
                            # intermediates if the pass consumes the results
                            if tens is not None:
                                reverse_intermediates.append(tens)

                        if operation_set is None:
                            print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")

                        for inp in curr_op.inputs:
                            can_pack = True
                            if len(inp.ops) == 1:
                                next_op = inp.ops[0]
                                for outp in next_op.outputs:
                                    consumers = outp.consumers()
                                    if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
                                        can_pack = False
                                        break
                            else:
                                can_pack = False

                            if can_pack:
                                to_process.append((next_op, inp))
                            else:
                                assert inp is not None
                                input_set.add(inp)

                        break

            else:
                # No rule matched (the for loop finished without a break): this
                # operation is not compatible with the already packed operations,
                # so just register the tensor as an input
                assert tens is not None
                input_set.add(tens)

            if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
                # Make the choice that if we don't have a MAC operation, the ambidextrous operations go on the
                # element-wise unit
                curr_flags |= PassFlags.ElementWise

        is_element_wise = True
        for op in reverse_ops_list:
            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
                is_element_wise = False
                break

        placement = PassPlacement.Unknown
        if curr_flags & PassFlags.Npu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Npu
        if curr_flags & PassFlags.Cpu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Cpu
        if curr_flags & PassFlags.MemoryOnly:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.MemoryOnly
        if curr_flags & PassFlags.StartupInit:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.StartupInit
        assert placement != PassPlacement.Unknown

        ops_list = list(reversed(reverse_ops_list))
        intermediates = list(reversed(reverse_intermediates))

        if primary_op is None:
            primary_op = create_primary_op(ops_list)
            if primary_op is not None:
                # Attaching the synthesised op leaves an extra entry in the input
                # tensor's consumer list, so bump its visit refcount to match
                visit_tensor_refcount[primary_op.inputs[0]] += 1
                npu_block_type = primary_op.attrs["npu_block_type"]
                for input_tens in primary_op.inputs:
                    if input_tens not in input_set:
                        input_set.add(input_tens)

        ordered_input_list = []
        input_refcounts = collections.defaultdict(int)
        for op in ops_list:
            for inp in op.inputs:
                if inp in input_set:
                    if input_refcounts[inp] == 0:
                        ordered_input_list.append(inp)
                    input_refcounts[inp] += 1

        name = ops_list[0].name
        non_dma_ops = [op for op in ops_list if op.type != "DMA"]
        if non_dma_ops:
            name = non_dma_ops[0].name
        ps = Pass(name, placement, is_element_wise, npu_block_type)
        ps.ops = ops_list
        ps.primary_op = primary_op
        ps.inputs = ordered_input_list
        ps.intermediates = intermediates
        ps.outputs = list(ops_list[-1].outputs)
        ps.ifm_tensor = ifm_tensor

        # ElementWise operation, 2 IFMs
        if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
            ps.ifm_tensor = ps.inputs[0]

            if len(ps.inputs) == 1:
                # Only 1 input, IFM and IFM2 are the same tensor
                ps.ifm2_tensor = ps.inputs[0]
            else:
                ps.ifm2_tensor = ps.inputs[1]
        else:
            ps.ifm_tensor = ifm_tensor
            ps.ifm2_tensor = None

        ps.ofm_tensor = ofm_tensor
        assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
        ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
        ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]

        for op in ps.ops:
            op.scheduled_pass = ps

        reverse_pass_list.append(ps)

        # Continue the traversal through each of the pass's inputs, once per use
        for inp, refcount in input_refcounts.items():
            for _ in range(refcount):
                visit_tensor(inp)

        return ps

    def visit_tensor(tens):
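        """Mark one more consumer of tens as visited; once all consumers are
        accounted for, recurse into the ops that produce it."""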
        visit_tensor_refcount[tens] += 1
        assert visit_tensor_refcount[tens] <= len(tens.consumers())
        if visit_tensor_refcount[tens] == len(tens.consumers()):
            for op in reversed(tens.ops):
                visit_op(op, tens)

    def create_primary_op(ops_list):
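        """Called when a pass has no main operation: if it contains NPU
        pre/post ops, synthesise a 1x1 unit-stride AvgPool (a pass-through on
        the feature map) to act as the primary operation. Returns the new op,
        or None if no primary op is needed.
        """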
        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
            # Configure a 1x1 AvgPool and attach the op onto it
            op = ops_list[0]
            inp = op.inputs[0]
            avgpool_name = op.name + "_avgpool"
            avgpool_op = Operation("AvgPool", avgpool_name)
            avgpool_op.inputs = [inp]
            avgpool_op.inputs[0].consumer_list.append(avgpool_op)
            avgpool_op.attrs["padding"] = b"VALID"
            avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
            avgpool_op.attrs["stride_w"] = 1
            avgpool_op.attrs["stride_h"] = 1
            avgpool_op.attrs["filter_width"] = 1
            avgpool_op.attrs["filter_height"] = 1
            avgpool_op.attrs["strides"] = [1, 1, 1, 1]
            avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
            avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
            avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
            avgpool_out = inp.clone("_avgpooled")
            avgpool_out.consumer_list.append(op)
            avgpool_out.ops = [avgpool_op]
            avgpool_op.outputs = [avgpool_out]

            op.inputs[0] = avgpool_out
            ops_list.insert(0, avgpool_op)

            return avgpool_op

        return None

    for sg in nng.subgraphs:
        reverse_pass_list = []
        visit_op_refcount = collections.defaultdict(int)
        visit_tensor_refcount = collections.defaultdict(int)

        startup_list = []

        for tens in sg.output_tensors:
            visit_tensor(tens)

        if startup_list:
            startup_ps = build_pass(startup_list)
            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fix up the outputs
            startup_ps.name = "startup_weight_initialisation"

        sg.passes = list(reversed(reverse_pass_list))
        sg.build_pass_links()

    if verbose_packing:
        nng.print_passes()

    return nng