# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
import collections
import enum

from .nn_graph import Pass
from .nn_graph import PassPlacement
from .operation import NpuBlockType
from .operation import Operation
from .tensor import TensorPurpose


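# Bit flags describing the roles of the operations packed into a pass so far.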
class PassFlags(enum.Flag):
    Empty = 0
    Pre = 1
    Main = 2
    Post = 4
    Mac = 8
    Dma = 32
    ElementWise = 256
    Npu = 512
    Cpu = 1024
    StartupInit = 2048
    MemoryOnly = 4096
    PostFusingLimited = 8192


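# Operator-type name sets used to classify operations during pass packing.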
npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))

mac_main_ops = set(
    (
        # convolutions
        "Conv2DBiasAct",
        "Conv2D",
        "QuantizedConv2D",
        "Conv2DBackpropInputSwitchedBias",
        # depth-wise convolutions
        "DepthwiseConv2dBiasAct",
        "DepthwiseConv2dNative",
        "QuantizedDepthwiseConv2D",
        # FC layers
        "QuantizedMatMul",
        "MatMul",
        "FullyConnectedAct",
        # RNN/LSTM/GRU
        "BlockLSTM",
        # pooling
        "QuantizedMaxPool",
        "QuantizedAvgPool",
        "AvgPool",
        "MaxPool",
        "AvgPoolAct",
        "MaxPoolAct",
        "ReduceSum",
        # deconvolution
        "ResizeBilinear",
    )
)

binary_elem_wise_main_ops = set(
    (
        # binary element-wise
        "AddAct",
        "MulAct",
        "SubAct",
        "QuantizedAdd",
        "QuantizedSub",
        "QuantizedMul",
        "Mul",
        "Add",
        "Sub",
        "Minimum",
        "Maximum",
        "SHL",
        "SHR",
    )
)

unary_elem_wise_main_ops = set(("LeakyRelu", "Abs", "CLZ",))  # Unary element-wise operations

elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops

activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
npu_post_ops = activation_ops | set(
    # Bias-add operations: Get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
    ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
)

npu_post_fuse_limited_ops = set(
    # Set of post operators that should not be fused with main/elementwise ops
    ("ConcatSliceWrite", "Sigmoid", "Tanh", "Quantize")
)

elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))


quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops

npu_dma_ops = set(("DMA",))
startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))


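# Each entry is a tuple of (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear).
# build_pass() matches every operation against these entries in order: an op is packed into
# the current pass only if the pass's accumulated flags do not overlap incompatible_pack_flags.
# The final entry (ops_set of None) is a catch-all that places unrecognised ops on the CPU.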
test_sequence = [
    (
        # ops_set
        npu_post_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.Post,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_post_fuse_limited_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.PostFusingLimited,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        mac_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.ElementWise
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        elem_wise_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.Mac
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_pre_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_dma_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Dma,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        startup_init_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.StartupInit | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        memory_only_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu,
        # flags_to_set
        PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        cpu_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (  # This last one is a fallback for unrecognised operations
        # ops_set
        None,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
]

# Some sanity checking
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
    assert not flags_to_clear & flags_to_set

    if operation_set is not None:
        for op in operation_set:
            assert len(op) > 1  # This is to avoid string literals being decomposed


def pack_into_passes(nng, arch, verbose_packing=False):
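    """Pack the operations of every subgraph in nng into Passes.

    Packing works backwards from each subgraph's output tensors, greedily
    fusing producer operations into the consumer's pass whenever the rules
    in test_sequence allow it.
    """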
    def visit_op(op, ignored):
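        # Called once per output edge of op. When all outputs have been visited,
        # the op either joins the startup pass or seeds a new pass of its own.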
        visit_op_refcount[op] += 1

        if visit_op_refcount[op] == 1:  # First-time visit, go and fix up unused output tensors
            for tens in op.outputs:
                if len(tens.consumers()) == 0:
                    visit_op_refcount[op] += 1

        assert visit_op_refcount[op] <= len(op.outputs)
        if visit_op_refcount[op] == len(op.outputs):

            if op.type in startup_init_ops:
                startup_list.append(op)
            else:
                _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
                if ofm_tensor is None:
                    ofm_tensor = op.outputs[0]
                build_pass((op,), ofm_tensor)

    def build_pass(start_ops_to_process, ofm_tensor=None):
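        # Grow a pass backwards from start_ops_to_process: a producer op is fused
        # in only if test_sequence permits it and the current op is the sole
        # consumer of all of that producer's outputs. Returns the completed Pass.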
        reverse_ops_list = []
        curr_flags = PassFlags.Empty
        npu_block_type = NpuBlockType.Default

        reverse_intermediates = []
        input_set = set()
        ifm_tensor = None
        primary_op = None

        to_process = collections.deque()
        for start_op in start_ops_to_process:
            to_process.append((start_op, None))

        while to_process:
            curr_op, tens = to_process.popleft()

            if curr_op in reverse_ops_list:
                continue

            for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
                if operation_set is None or curr_op.type in operation_set:
                    if not (curr_flags & incompatible_pack_flags):
                        if flags_to_set & PassFlags.Npu:
                            if not curr_op.run_on_npu:
                                continue

                        reverse_ops_list.append(curr_op)
                        new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
                        if new_block_type != NpuBlockType.Default:
                            assert npu_block_type == NpuBlockType.Default
                            npu_block_type = new_block_type  # Only one major block type per pass
                            assert primary_op is None
                            primary_op = curr_op

                        curr_flags &= ~flags_to_clear
                        curr_flags |= flags_to_set

                        if flags_to_set & PassFlags.Npu:
                            if flags_to_set & (
                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
                            ):
                                assert len(curr_op.inputs) >= 1
                                if curr_op.type == "BlockLSTM":
                                    ifm_tensor = curr_op.inputs[3]
                                else:
                                    ifm_tensor = curr_op.inputs[0]
                                assert ifm_tensor.purpose == TensorPurpose.FeatureMap

                        if flags_to_set & PassFlags.Dma:
                            # DMAs are special - Output buffers need to be preserved as intermediates,
                            # if the pass consumes the results
                            if tens is not None:
                                reverse_intermediates.append(tens)

                        if operation_set is None:
                            print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")

                        for inp in reversed(curr_op.inputs):
                            can_pack = True
                            if len(inp.ops) == 1:
                                next_op = inp.ops[0]
                                for outp in next_op.outputs:
                                    consumers = outp.consumers()
                                    if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
                                        can_pack = False
                                        break
                            else:
                                can_pack = False

                            if can_pack:
                                to_process.append((next_op, inp))
                            else:
                                assert inp is not None
                                input_set.add(inp)

                        break

            else:
                # This operation is not compatible with already packed operations, just register the tensor as an input
                assert tens is not None
                input_set.add(tens)

        if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
            # Make the choice that if we don't have a mac operation, the ambidextrous operations go on the
            # element wise unit
            curr_flags |= PassFlags.ElementWise

        is_element_wise = True
        for op in reverse_ops_list:
            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
                is_element_wise = False
                break

        placement = PassPlacement.Unknown
        if curr_flags & PassFlags.Npu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Npu
        if curr_flags & PassFlags.Cpu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Cpu
        if curr_flags & PassFlags.MemoryOnly:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.MemoryOnly
        if curr_flags & PassFlags.StartupInit:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.StartupInit
        assert placement != PassPlacement.Unknown

        ops_list = list(reversed(reverse_ops_list))
        intermediates = list(reversed(reverse_intermediates))

        if primary_op is None:
            primary_op = create_primary_op(ops_list)
            if primary_op is not None:
                visit_tensor_refcount[primary_op.inputs[0]] += 1
                npu_block_type = primary_op.attrs["npu_block_type"]
                for input_tens in primary_op.inputs:
                    if input_tens not in input_set:
                        input_set.add(input_tens)

        ordered_input_list = []
        # Keep LUT-s in a separate list and add as inputs at the end
        # to avoid that they would accidentally be assigned as ifm or ifm2
        lut_list = []
        input_refcounts = collections.defaultdict(int)
        for op in ops_list:
            for inp in op.inputs:
                if inp in input_set:
                    if input_refcounts[inp] == 0:
                        if inp.purpose == TensorPurpose.LUT:
                            lut_list.append(inp)
                        else:
                            ordered_input_list.append(inp)
                    input_refcounts[inp] += 1

        name = ops_list[0].name
        non_dma_ops = [op for op in ops_list if op.type != "DMA"]
        if non_dma_ops:
            name = non_dma_ops[0].name
        ps = Pass(name, placement, is_element_wise, npu_block_type)
        ps.ops = ops_list
        ps.primary_op = primary_op
        ps.inputs = ordered_input_list
        ps.intermediates = intermediates
        ps.outputs = list(ops_list[-1].outputs)

        # ElementWise operation, 2 IFMs
        if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
            ps.ifm_tensor = ps.inputs[0]
            ps.ifm2_tensor = ps.inputs[-1]

            if len(ps.inputs) > 2:
                ps.ifm_tensor = ps.inputs[-2]
        else:
            ps.ifm_tensor = ifm_tensor
            ps.ifm2_tensor = None

        ps.ofm_tensor = ofm_tensor
        assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
        ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
        ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]
        ps.lut_tensor = ps.get_primary_op_lut()
        ps.inputs.extend(lut_list)

        for op in ps.ops:
            op.scheduled_pass = ps

        reverse_pass_list.append(ps)

        for inp, refcount in input_refcounts.items():
            for _ in range(refcount):
                visit_tensor(inp)

        return ps

    def visit_tensor(tens):
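        # Called once per consumer of tens; when every consumer has been
        # visited, recurse into the ops that produce the tensor.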
        visit_tensor_refcount[tens] += 1
        assert visit_tensor_refcount[tens] <= len(tens.consumers())
        if visit_tensor_refcount[tens] == len(tens.consumers()):
            for op in reversed(tens.ops):
                visit_op(op, tens)

    def create_primary_op(ops_list):
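        # If a pass holds only pre/post ops and no main op, synthesise a 1x1
        # AvgPool to act as the primary op the others can fuse onto.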
        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
            # Configure a 1x1 AvgPool and attach the op onto it
            op = ops_list[0]
            inp = op.inputs[0]
            avgpool_name = op.name + "_avgpool"
            avgpool_op = Operation("AvgPool", avgpool_name)
            avgpool_op.inputs = [inp]
            avgpool_op.inputs[0].consumer_list.append(avgpool_op)
            avgpool_op.attrs["padding"] = b"VALID"
            avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
            avgpool_op.attrs["stride_w"] = 1
            avgpool_op.attrs["stride_h"] = 1
            avgpool_op.attrs["filter_width"] = 1
            avgpool_op.attrs["filter_height"] = 1
            avgpool_op.attrs["strides"] = [1, 1, 1, 1]
            avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
            avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
            avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
            avgpool_out = inp.clone("_avgpooled")
            avgpool_out.consumer_list.append(op)
            avgpool_op.set_output_tensor(avgpool_out)

            op.inputs[0] = avgpool_out
            ops_list.insert(0, avgpool_op)

            return avgpool_op

        return None

    for sg in nng.subgraphs:
        reverse_pass_list = []
        visit_op_refcount = collections.defaultdict(int)
        visit_tensor_refcount = collections.defaultdict(int)

        startup_list = []

        for tens in sg.output_tensors:
            visit_tensor(tens)

        if startup_list:
            startup_ps = build_pass(startup_list)
            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fixup the outputs
            startup_ps.name = "startup_weight_initialisation"

        sg.passes = list(reversed(reverse_pass_list))
        sg.build_pass_links()

        if verbose_packing:
            nng.print_passes()

    return nng