Blame - ethosu/vela/high_level_command_stream_generator.py - ml/ethos-u/ethos-u-vela

blob: 0cd3ad22cc541ce75aeae77e42922fef05734c0e [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
				18	#
				19	# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
				20	# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	21	from .high_level_command_stream import Box
				22	from .high_level_command_stream import DMA
				23	from .high_level_command_stream import NpuStripe
				24	from .nn_graph import PassPlacement
				25	from .nn_graph import SchedulingStrategy
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	26	from .operation import NpuBlockType
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	27	from .tensor import TensorPurpose
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	28
				29
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command for ``tensor`` if the tensor says it needs one.

    Args:
        ps: The pass being processed (not used by this function).
        box: The box handed to the DMA command.
        tensor: Destination tensor; ``tensor.needs_dma()`` decides whether a
            command is produced at all.

    Yields:
        At most one ``DMA`` command whose source is the first input of the
        first op in ``tensor.ops`` and whose destination is ``tensor``.
    """
    if not tensor.needs_dma():
        return
    producer = tensor.ops[0]
    source_tensor = producer.inputs[0]
    yield DMA(source_tensor, tensor, box)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	35
def match_tensor(source, derived):
    """Tell whether ``derived`` is ``source`` or a single split-slice of it.

    Returns True when the two compare equal. Otherwise ``derived`` matches
    only if it has exactly one op, that op has type "SplitSliceRead", and the
    op's first input compares equal to ``source``.
    """
    if source == derived:
        return True
    producers = derived.ops
    # An empty op list or several ops can never be a single slice read.
    if len(producers) != 1:
        return False
    producer = producers[0]
    return producer.type == "SplitSliceRead" and source == producer.inputs[0]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	44
				45	def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
				46	is_first = idx == 0
				47	is_last = idx == len(passes) - 1
				48	ps = passes[idx]
				49	block_config = block_configs[idx]
Charles Xu	600351a	2020-05-18 08:54:47 +0200	[diff] [blame^]	50	npu_block_type = ps.npu_block_type
				51	split_offsets = [None, None] # offset for [ifm, ifm2]
				52
				53	ifm_idx = 0
				54	for op in ps.ops:
				55	if op.type == "SplitSliceRead":
				56	split_offsets[ifm_idx] = op.attrs["split_start"]
				57	ps.primary_op.attrs["fused_memory_function"] = op.type
				58	ifm_idx += 1
				59
				60	if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
				61	# Ensure correct imf and ifm2 order
				62	if (match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and
				63	match_tensor(ps.inputs[1], ps.primary_op.inputs[0])):
				64	ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
				65	split_offsets[0], split_offsets[1] = split_offsets[1], split_offsets[0]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	66
				67	ifm_tensor = ps.ifm_tensor
				68	ifm2_tensor = ps.ifm2_tensor
				69	ofm_tensor = ps.ofm_tensor
				70	weight_tensor = ps.weight_tensor
				71	scale_tensor = ps.scale_tensor
				72
				73	ofm_start = [0] * len(ofm_tensor.shape)
				74	ofm_end = list(ofm_tensor.shape)
				75
				76	strides = None
				77	skirt = None
				78	if ps.primary_op is not None:
				79	strides = ps.primary_op.attrs.get("strides", None)
				80	skirt = ps.primary_op.attrs.get("skirt", None)
				81
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	82	concat_axis = 0
				83	concat_offset = 0
				84
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	85	# Fusable activation functions
				86	activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))
				87
				88	for op in ps.ops:
				89	if op.type == "ConcatSliceWrite":
				90	concat_axis = op.attrs["concat_axis"]
				91	concat_start = op.attrs["concat_start"]
				92	concat_end = op.attrs["concat_end"]
				93
				94	ofm_start[concat_axis] = concat_start
				95	ofm_end[concat_axis] = concat_end
				96	concat_offset = concat_start
				97	ps.primary_op.attrs["fused_memory_function"] = op.type
				98	elif op.type in activation_ops:
				99	ps.primary_op.attrs["fused_activation_function"] = op.type
				100
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	101	if strat == SchedulingStrategy.WeightStream:
				102	ofm_step = block_config[-1]
				103	ofm_stop = ofm_end[-1]
Louis Verhaard	3c07c97	2020-05-07 08:12:58 +0200	[diff] [blame]	104	if weight_tensor is None or not weight_tensor.needs_dma():
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	105	ofm_step = ofm_stop
				106	for start in range(ofm_start[-1], ofm_stop, ofm_step):
				107	end = min(start + ofm_step, ofm_stop)
				108	ofm_start[-1] = start
				109	ofm_end[-1] = end
				110	ofm_box = Box(ofm_start, ofm_end)
				111	ifm_box = None
				112	ifm2_box = None
				113
				114	if ifm_tensor.shape != []:
				115	ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
				116	strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
				117	)
				118	else:
				119	ifm_box = Box([], [])
				120	if ifm2_tensor is not None and ifm2_tensor.shape != []:
				121	ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
				122	strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
				123	)
				124	else:
				125	ifm2_box = Box([], [])
				126
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	127	for intermediate in ps.intermediates:
				128	if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
				129	intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
				130	strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
				131	)
				132	yield from dma_if_necessary(ps, intermediate_box, intermediate)
				133
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	134	weight_box = None
				135	if weight_tensor is not None:
				136	weight_oc_start = start
				137	weight_oc_end = end
				138	if concat_axis - len(weight_tensor.shape) == -1:
				139	weight_oc_start -= concat_offset
				140	weight_oc_end -= concat_offset
				141
				142	weight_box = Box.make_weight_box(
				143	weight_tensor.shape,
				144	npu_block_type,
				145	weight_oc_start,
				146	weight_oc_end,
				147	weight_tensor.weight_transpose_depthwise,
				148	)
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	149	yield from dma_if_necessary(ps, weight_box, weight_tensor)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	150
				151	yield NpuStripe(
				152	ps,
				153	block_config,
				154	is_first,
				155	is_last,
				156	True,
				157	True,
				158	ifm_tensor,
				159	ifm_box,
				160	ofm_tensor,
				161	ofm_box,
				162	weight_tensor,
				163	weight_box,
				164	scale_tensor,
				165	concat_axis,
				166	concat_offset,
				167	ifm2_tensor=ifm2_tensor,
				168	ifm2_box=ifm2_box,
				169	)
				170
				171	elif strat == SchedulingStrategy.IfmStream:
				172	y_step = block_config[0]
				173	y_start = 0
				174	y_dim = 1
				175	if len(ofm_tensor.shape) >= 3:
				176	y_start = ofm_start[-3]
				177	y_dim = ofm_end[-3]
				178	if idx > 0:
				179	ifm_y_present = 0
				180	prev_pass = passes[idx - 1]
				181	prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
				182	else:
				183	ifm_y_present = 1
				184	if len(ifm_tensor.shape) >= 3:
				185	ifm_y_present = ifm_tensor.shape[-3]
				186	prev_pass_gen = []
				187	prev_pass = None
				188
				189	if len(passes) == 1:
				190	# no cascading, can just issue one big stripe
				191	# but only if we've done allocation and OFM does not overlap IFM
				192	if ifm_tensor.address != -1 and ofm_tensor.address != -1:
				193	if (
				194	ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
				195	or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
				196	):
				197	y_step = y_dim
				198
				199	weight_box = None
				200
				201	for start in range(y_start, y_dim, y_step):
				202	end = min(start + y_step, y_dim)
				203	if len(ofm_tensor.shape) >= 3:
				204	ofm_start[-3] = start
				205	ofm_end[-3] = end
				206	ofm_box = Box(ofm_start, ofm_end)
				207
				208	k_height = 1
				209	if npu_block_type == NpuBlockType.Pooling:
				210	if ps.primary_op is not None:
				211	k_height = ps.primary_op.attrs["ksize"][1]
				212	else:
				213	if weight_tensor is not None:
				214	k_height = weight_tensor.shape[0]
				215
				216	ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
				217	strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
				218	)
				219
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	220	for intermediate in ps.intermediates:
				221	if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
				222	intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
				223	strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
				224	)
				225	yield from dma_if_necessary(ps, intermediate_box, intermediate)
				226
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	227	ifm_y_needed = 1
				228	if len(ifm_box.end_coord) >= 3:
				229	ifm_y_needed = ifm_box.end_coord[-3]
				230	if ifm_y_present < ifm_y_needed:
				231	for prev_cmd in prev_pass_gen:
				232	yield prev_cmd
				233	rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
				234	if rng is not None:
				235	ifm_y_present = max(ifm_y_present, rng[1])
				236	if ifm_y_present >= ifm_y_needed:
				237	break
				238
				239	if weight_tensor is not None and weight_box is None:
				240	weight_box = Box.make_weight_box(
				241	weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
				242	)
Charles Xu	7879222	2020-05-13 10:15:26 +0200	[diff] [blame]	243	yield from dma_if_necessary(ps, weight_box, weight_tensor)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	244
				245	# Check if first/last stripe in pass
				246	is_first_h_stripe = start == y_start
				247	is_last_h_stripe = (start + y_step) >= y_dim
				248
				249	stripe = NpuStripe(
				250	ps,
				251	block_config,
				252	is_first,
				253	is_last,
				254	is_first_h_stripe,
				255	is_last_h_stripe,
				256	ifm_tensor,
				257	ifm_box,
				258	ofm_tensor,
				259	ofm_box,
				260	weight_tensor,
				261	weight_box,
				262	scale_tensor,
				263	concat_axis,
				264	concat_offset,
				265	None,
				266	None,
				267	pad_top,
				268	pad_bottom,
				269	)
				270	yield stripe
				271	else:
				272	assert 0, "unknown scheduling strategy"
				273
				274
				275	def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				276	if strat == SchedulingStrategy.WeightStream:
				277	for idx in range(len(passes)):
				278	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
				279	elif strat == SchedulingStrategy.IfmStream:
				280	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
				281	else:
				282	assert 0, "Unknown streaming strategy"
				283
				284
				285	def generate_high_level_command_stream_for_cascaded_pass(cps):
				286	yield from generate_high_level_command_stream_for_pass_list(
				287	cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
				288	)
				289
				290
				291	def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
				292	res = []
				293	for cps in sg.cascaded_passes:
				294	if cps.placement == PassPlacement.Npu:
				295	res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
				296
				297	sg.high_level_command_stream = res
				298	if verbose_high_level_command_stream:
				299	sg.print_high_level_command_stream()
				300
				301
				302	def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
				303	highest_ofm_write = 0
				304	if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
				305	return 0
				306
				307	ifm_read = passes[0].ifm_tensor.storage_size
				308	min_overlap = 999999999999999999999
				309	ofm_size = passes[-1].ofm_tensor.storage_size()
				310	if strat == SchedulingStrategy.WeightStream:
				311	return 0
				312	for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				313	if cmd.is_npu_pass_command():
				314	if cmd.is_first:
				315	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
				316	if ifm_read is None:
				317	return 0
				318	if cmd.is_last:
				319	write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
				320	if write_offset is None:
				321	return 0
				322	highest_ofm_write = max(write_offset, highest_ofm_write)
				323
				324	if cmd.is_first or cmd.is_last:
				325	overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
				326	can_overwrite = ofm_size - overlap_required
				327	min_overlap = min(min_overlap, can_overwrite)
				328
				329	if cmd.is_first:
				330	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
				331
				332	min_overlap = max(min_overlap, 0)
				333	return min_overlap
				334
				335
				336	def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
				337	return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])