Blame - ethosu/vela/high_level_command_stream_generator.py - ml/ethos-u/ethos-u-vela

blob: 2297a3bf914263e6b54ac57ee706eaa1e75f1efe [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
				18	#
				19	# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
				20	# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	21	from .high_level_command_stream import Box
				22	from .high_level_command_stream import DMA
				23	from .high_level_command_stream import NpuStripe
				24	from .nn_graph import PassPlacement
				25	from .nn_graph import SchedulingStrategy
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	26	from .operation import NpuBlockType
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	27	from .tensor import TensorPurpose
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	28
				29
def dma_if_necessary(ps, box, tensor):
    """Yield a single DMA command for `tensor` when it reports that it needs one.

    The DMA source is the first input of the first op in `tensor.ops`; the
    destination is `tensor` itself, restricted to `box`. Nothing is yielded
    when `tensor.needs_dma()` is false.

    `ps` is accepted for signature compatibility with callers; it is not used.
    """
    if not tensor.needs_dma():
        return

    # The op producing the tensor is taken to be the DMA op; its first input is the source.
    source_tensor = tensor.ops[0].inputs[0]
    yield DMA(source_tensor, tensor, box)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	35
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	36
def match_tensor(source, derived):
    """Return whether `derived` is `source`, or a "SplitSliceRead" view of it.

    A match is either equality of the two tensors, or `derived` being produced
    by exactly one op of type "SplitSliceRead" whose first input equals `source`.
    """
    if source == derived:
        return True

    producers = derived.ops
    # Exactly one producer is required; this also rejects an empty op list.
    if len(producers) != 1:
        return False

    producer = producers[0]
    return producer.type == "SplitSliceRead" and source == producer.inputs[0]
				42
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	43
				44	def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
				45	is_first = idx == 0
				46	is_last = idx == len(passes) - 1
				47	ps = passes[idx]
				48	block_config = block_configs[idx]
Charles Xu	600351a	2020-05-18 08:54:47 +0200	[diff] [blame]	49	npu_block_type = ps.npu_block_type
				50	split_offsets = [None, None] # offset for [ifm, ifm2]
				51
				52	ifm_idx = 0
				53	for op in ps.ops:
				54	if op.type == "SplitSliceRead":
				55	split_offsets[ifm_idx] = op.attrs["split_start"]
				56	ps.primary_op.attrs["fused_memory_function"] = op.type
				57	ifm_idx += 1
				58
				59	if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
				60	# Ensure correct imf and ifm2 order
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	61	if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
Charles Xu	600351a	2020-05-18 08:54:47 +0200	[diff] [blame]	62	ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
				63	split_offsets[0], split_offsets[1] = split_offsets[1], split_offsets[0]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	64
				65	ifm_tensor = ps.ifm_tensor
				66	ifm2_tensor = ps.ifm2_tensor
				67	ofm_tensor = ps.ofm_tensor
				68	weight_tensor = ps.weight_tensor
				69	scale_tensor = ps.scale_tensor
				70
				71	ofm_start = [0] * len(ofm_tensor.shape)
				72	ofm_end = list(ofm_tensor.shape)
				73
				74	strides = None
				75	skirt = None
Jacob Bohlin	611fcdf	2020-06-11 15:09:57 +0200	[diff] [blame]	76	upscaling = 1
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	77	if ps.primary_op is not None:
				78	strides = ps.primary_op.attrs.get("strides", None)
				79	skirt = ps.primary_op.attrs.get("skirt", None)
Jacob Bohlin	611fcdf	2020-06-11 15:09:57 +0200	[diff] [blame]	80	if ps.primary_op.type in set(("Conv2DBackpropInputSwitchedBias", "ResizeBilinear")):
				81	upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	82
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	83	concat_axis = 0
				84	concat_offset = 0
				85
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	86	# Fusable activation functions
				87	activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
				88
				89	for op in ps.ops:
				90	if op.type == "ConcatSliceWrite":
				91	concat_axis = op.attrs["concat_axis"]
				92	concat_start = op.attrs["concat_start"]
				93	concat_end = op.attrs["concat_end"]
				94
				95	ofm_start[concat_axis] = concat_start
				96	ofm_end[concat_axis] = concat_end
				97	concat_offset = concat_start
				98	ps.primary_op.attrs["fused_memory_function"] = op.type
				99	elif op.type in activation_ops:
				100	ps.primary_op.attrs["fused_activation_function"] = op.type
				101
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	102	if strat == SchedulingStrategy.WeightStream:
				103	ofm_step = block_config[-1]
				104	ofm_stop = ofm_end[-1]
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	105	if weight_tensor is None or not weight_tensor.needs_dma():
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	106	ofm_step = ofm_stop
				107	for start in range(ofm_start[-1], ofm_stop, ofm_step):
				108	end = min(start + ofm_step, ofm_stop)
				109	ofm_start[-1] = start
				110	ofm_end[-1] = end
				111	ofm_box = Box(ofm_start, ofm_end)
				112	ifm_box = None
				113	ifm2_box = None
				114
				115	if ifm_tensor.shape != []:
				116	ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	117	strides,
				118	skirt,
				119	ifm_tensor.shape,
				120	npu_block_type,
				121	concat_axis,
				122	concat_offset,
				123	split_offsets[0],
				124	upscaling,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	125	)
				126	else:
				127	ifm_box = Box([], [])
				128	if ifm2_tensor is not None and ifm2_tensor.shape != []:
				129	ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	130	strides,
				131	skirt,
				132	ifm2_tensor.shape,
				133	npu_block_type,
				134	concat_axis,
				135	concat_offset,
				136	split_offsets[1],
				137	upscaling,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	138	)
				139	else:
				140	ifm2_box = Box([], [])
				141
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	142	for intermediate in ps.intermediates:
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	143	if (
				144	intermediate is not None
				145	and intermediate.shape != []
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame^]	146	and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	147	):
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame^]	148	if intermediate.purpose is TensorPurpose.FeatureMap:
				149	intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
				150	strides,
				151	skirt,
				152	intermediate.shape,
				153	npu_block_type,
				154	concat_axis,
				155	concat_offset,
				156	split_offsets[0],
				157	upscaling,
				158	)
				159	else:
				160	intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	161	yield from dma_if_necessary(ps, intermediate_box, intermediate)
				162
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	163	weight_box = None
				164	if weight_tensor is not None:
				165	weight_oc_start = start
				166	weight_oc_end = end
				167	if concat_axis - len(weight_tensor.shape) == -1:
				168	weight_oc_start -= concat_offset
				169	weight_oc_end -= concat_offset
				170
				171	weight_box = Box.make_weight_box(
				172	weight_tensor.shape,
				173	npu_block_type,
				174	weight_oc_start,
				175	weight_oc_end,
				176	weight_tensor.weight_transpose_depthwise,
				177	)
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	178	yield from dma_if_necessary(ps, weight_box, weight_tensor)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	179
				180	yield NpuStripe(
				181	ps,
				182	block_config,
				183	is_first,
				184	is_last,
				185	True,
				186	True,
				187	ifm_tensor,
				188	ifm_box,
				189	ofm_tensor,
				190	ofm_box,
				191	weight_tensor,
				192	weight_box,
				193	scale_tensor,
				194	concat_axis,
				195	concat_offset,
				196	ifm2_tensor=ifm2_tensor,
				197	ifm2_box=ifm2_box,
				198	)
				199
				200	elif strat == SchedulingStrategy.IfmStream:
				201	y_step = block_config[0]
				202	y_start = 0
				203	y_dim = 1
				204	if len(ofm_tensor.shape) >= 3:
				205	y_start = ofm_start[-3]
				206	y_dim = ofm_end[-3]
				207	if idx > 0:
				208	ifm_y_present = 0
				209	prev_pass = passes[idx - 1]
				210	prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
				211	else:
				212	ifm_y_present = 1
				213	if len(ifm_tensor.shape) >= 3:
				214	ifm_y_present = ifm_tensor.shape[-3]
				215	prev_pass_gen = []
				216	prev_pass = None
				217
				218	if len(passes) == 1:
				219	# no cascading, can just issue one big stripe
				220	# but only if we've done allocation and OFM does not overlap IFM
Charles Xu	04ce34c	2020-06-23 12:42:28 +0200	[diff] [blame]	221	if ifm_tensor.address is not None and ofm_tensor.address is not None:
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	222	if (
				223	ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
				224	or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
				225	):
				226	y_step = y_dim
				227
				228	weight_box = None
				229
				230	for start in range(y_start, y_dim, y_step):
				231	end = min(start + y_step, y_dim)
				232	if len(ofm_tensor.shape) >= 3:
				233	ofm_start[-3] = start
				234	ofm_end[-3] = end
				235	ofm_box = Box(ofm_start, ofm_end)
				236
				237	k_height = 1
Fredrik Svedberg	a0c3624	2020-06-03 15:43:31 +0200	[diff] [blame^]	238	if npu_block_type == set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	239	if ps.primary_op is not None:
				240	k_height = ps.primary_op.attrs["ksize"][1]
				241	else:
				242	if weight_tensor is not None:
				243	k_height = weight_tensor.shape[0]
				244
				245	ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	246	strides,
				247	skirt,
				248	ifm_tensor.shape,
				249	npu_block_type,
				250	concat_axis,
				251	concat_offset,
				252	split_offsets[0],
				253	k_height,
				254	upscaling,
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	255	)
				256
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	257	for intermediate in ps.intermediates:
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	258	if (
				259	intermediate is not None
				260	and intermediate.shape != []
				261	and intermediate.purpose == TensorPurpose.FeatureMap
				262	):
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	263	intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	264	strides,
				265	skirt,
				266	intermediate.shape,
				267	npu_block_type,
				268	concat_axis,
				269	concat_offset,
				270	split_offsets[0],
				271	upscaling,
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	272	)
				273	yield from dma_if_necessary(ps, intermediate_box, intermediate)
				274
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	275	ifm_y_needed = 1
				276	if len(ifm_box.end_coord) >= 3:
				277	ifm_y_needed = ifm_box.end_coord[-3]
				278	if ifm_y_present < ifm_y_needed:
				279	for prev_cmd in prev_pass_gen:
				280	yield prev_cmd
				281	rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
				282	if rng is not None:
				283	ifm_y_present = max(ifm_y_present, rng[1])
				284	if ifm_y_present >= ifm_y_needed:
				285	break
				286
				287	if weight_tensor is not None and weight_box is None:
				288	weight_box = Box.make_weight_box(
				289	weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
				290	)
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	291	yield from dma_if_necessary(ps, weight_box, weight_tensor)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	292
				293	# Check if first/last stripe in pass
				294	is_first_h_stripe = start == y_start
				295	is_last_h_stripe = (start + y_step) >= y_dim
				296
				297	stripe = NpuStripe(
				298	ps,
				299	block_config,
				300	is_first,
				301	is_last,
				302	is_first_h_stripe,
				303	is_last_h_stripe,
				304	ifm_tensor,
				305	ifm_box,
				306	ofm_tensor,
				307	ofm_box,
				308	weight_tensor,
				309	weight_box,
				310	scale_tensor,
				311	concat_axis,
				312	concat_offset,
				313	None,
				314	None,
				315	pad_top,
				316	pad_bottom,
				317	)
				318	yield stripe
				319	else:
				320	assert 0, "unknown scheduling strategy"
				321
				322
				323	def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				324	if strat == SchedulingStrategy.WeightStream:
				325	for idx in range(len(passes)):
				326	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
				327	elif strat == SchedulingStrategy.IfmStream:
				328	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
				329	else:
				330	assert 0, "Unknown streaming strategy"
				331
				332
				333	def generate_high_level_command_stream_for_cascaded_pass(cps):
				334	yield from generate_high_level_command_stream_for_pass_list(
				335	cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
				336	)
				337
				338
				339	def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
				340	res = []
				341	for cps in sg.cascaded_passes:
				342	if cps.placement == PassPlacement.Npu:
				343	res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
				344
				345	sg.high_level_command_stream = res
				346	if verbose_high_level_command_stream:
				347	sg.print_high_level_command_stream()
				348
				349
				350	def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
				351	highest_ofm_write = 0
				352	if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
				353	return 0
				354
				355	ifm_read = passes[0].ifm_tensor.storage_size
				356	min_overlap = 999999999999999999999
				357	ofm_size = passes[-1].ofm_tensor.storage_size()
				358	if strat == SchedulingStrategy.WeightStream:
				359	return 0
				360	for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				361	if cmd.is_npu_pass_command():
				362	if cmd.is_first:
				363	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
				364	if ifm_read is None:
				365	return 0
				366	if cmd.is_last:
				367	write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
				368	if write_offset is None:
				369	return 0
				370	highest_ofm_write = max(write_offset, highest_ofm_write)
				371
				372	if cmd.is_first or cmd.is_last:
				373	overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
				374	can_overwrite = ofm_size - overlap_required
				375	min_overlap = min(min_overlap, can_overwrite)
				376
				377	if cmd.is_first:
				378	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
				379
				380	min_overlap = max(min_overlap, 0)
				381	return min_overlap
				382
				383
				384	def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
				385	return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])