blob: 50b913d88eddf24b4ca1ee0f8c5fcf5c92adc7a0 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
18#
19# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
20# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Charles Xu89a6bbf2020-08-11 12:31:58 +020026from .numeric_util import round_up_divide
Tim Hall79d07d22020-04-27 18:20:16 +010027from .operation import NpuBlockType
Charles Xu78792222020-05-13 10:15:26 +020028from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010029
30
def dma_if_necessary(ps, box, tensor):
    # Yield a DMA command for `tensor` when it is marked as needing a DMA
    # transfer; otherwise yield nothing. The source is the single producer
    # op's first input tensor.
    if not tensor.needs_dma():
        return
    src_tensor = tensor.ops[0].inputs[0]
    yield DMA(ps, src_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010036
Tim Hallc30f4952020-06-15 20:47:35 +010037
def match_tensor(source, derived):
    """Return True if `derived` is `source` itself, or is produced from
    `source` by a single SplitSliceRead op."""
    if source == derived:
        return True
    producer_ops = derived.ops
    if len(producer_ops) != 1:
        return False
    producer = producer_ops[0]
    return producer.type == "SplitSliceRead" and source == producer.inputs[0]
43
Tim Hall79d07d22020-04-27 18:20:16 +010044
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Yield high-level commands (DMA and NpuStripe) for passes[idx].

    strat selects the streaming strategy. For WeightStream the OFM is split
    along its last (depth) axis; for IfmStream the OFM is split along its
    height axis and commands from the previous pass in the cascade are pulled
    in lazily (recursively) whenever more IFM rows are needed.
    NOTE(review): mutates ps in place (swaps ifm/ifm2, sets primary_op attrs).
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor

        # Record per-input split offsets for fused SplitSliceRead ops,
        # matching each one to ifm or ifm2 by its input tensor.
        for op in ps.ops:
            if op.type == "SplitSliceRead":
                ps.primary_op.attrs["fused_memory_function"] = op.type
                assert len(op.inputs) == 1
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    assert False
    else:
        # Non-elementwise: assign SplitSliceRead offsets in encounter order.
        ifm_idx = 0
        for op in ps.ops:
            if op.type == "SplitSliceRead":
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.attrs["fused_memory_function"] = op.type
                ifm_idx += 1

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full OFM box; narrowed below per stripe and by ConcatSliceWrite.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        # Upscaling factor between IFM and OFM height (axis -3) for ops that
        # enlarge the feature map.
        if ps.primary_op.type == "Conv2DBackpropInputSwitchedBias":
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
        elif ps.primary_op.type == "ResizeBilinear":
            upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])

    concat_axis = 0
    concat_offset = 0

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    # Apply fused ops: ConcatSliceWrite narrows the OFM write window,
    # activations are recorded on the primary op.
    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    if strat == SchedulingStrategy.WeightStream:
        # Stripe along the OFM depth axis, one block of output channels at a
        # time; weights for each stripe are DMAed in as needed.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # No weight streaming needed - emit a single full-depth stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

            # DMA in intermediates (feature maps get a transformed box,
            # LUTs are transferred whole).
            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When concatenating along the channel axis the weight output
                # channels are relative to the concat offset.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stripe along the OFM height axis (-3); earlier passes in the
        # cascade are generated lazily to supply the IFM rows each stripe
        # needs.
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Cascaded: no IFM rows available yet; the previous pass's
            # generator produces them on demand.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM is already resident.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height drives how much extra IFM each stripe needs.
            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose == TensorPurpose.FeatureMap
                ):
                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        intermediate.shape,
                        npu_block_type,
                        concat_axis,
                        concat_offset,
                        split_offsets[0],
                        upscaling,
                    )
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            # Pull commands from the previous pass until enough IFM rows
            # (up to ifm_y_needed) have been produced for this stripe.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            # Weights are DMAed once, before the first stripe that needs them.
            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
335
336
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield the high-level commands for a list of passes under `strat`.

    WeightStream generates every pass in order; IfmStream starts at the last
    pass, which lazily pulls in its predecessors as it needs their output.
    """
    if strat == SchedulingStrategy.WeightStream:
        pass_indices = range(len(passes))
    elif strat == SchedulingStrategy.IfmStream:
        pass_indices = [len(passes) - 1]
    else:
        assert 0, "Unknown streaming strategy"
    for pass_idx in pass_indices:
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
345
346
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield the high-level commands for one CascadedPass."""
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
351
352
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph `sg` and store it on
    sg.high_level_command_stream. Only NPU-placed cascaded passes contribute.
    (`nng` and `arch` are part of the generator-pass signature but unused here.)
    """
    commands = []
    for cascaded_pass in sg.cascaded_passes:
        if cascaded_pass.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cascaded_pass))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
362
363
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how many bytes the final OFM may safely overlap the initial IFM.

    Simulates the command stream and, at every first-pass/last-pass command,
    compares the highest OFM byte written so far against the lowest IFM byte
    still to be read. Returns 0 when overlap cannot be proven safe (missing
    tensors, unresolvable addresses, or weight streaming).
    """
    highest_ofm_write = 0
    # Without both an input and an output tensor there is nothing to overlap.
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Fix: storage_size was referenced without calling it, assigning the bound
    # method instead of a byte count; any use before the first reassignment
    # would have raised a TypeError in min() below.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999  # effectively +infinity
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest IFM address this stripe still needs to read.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                # Highest OFM address written after this stripe.
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                # Overlap is limited by how far writes have run ahead of reads.
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # After the stripe executes, its whole IFM box has been consumed.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
396
397
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Compute the allowed IFM/OFM overlap for one CascadedPass."""
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)