Blame - ethosu/vela/high_level_command_stream_generator.py - ml/ethos-u/ethos-u-vela

blob: 364df6f88e04dbdc7020d51daafb62732bd2076a [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame^]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16
				17
				18	# Description:
				19	# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
				20	#
				21	# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
				22	# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
				23
				24	from .nn_graph import SchedulingStrategy, PassPlacement
				25	import numpy as np
				26	from .operation import NpuBlockType
				27	from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA
				28
				29
				30	def need_dma(tens):
				31	return len(tens.ops) == 1 and tens.ops[0].type == "DMA"
				32
				33
				34	def dma_weights_if_necessary(ps, box, weight_tensor):
				35	if need_dma(weight_tensor):
				36	dma_op = weight_tensor.ops[0]
				37	in_tensor = dma_op.inputs[0]
				38	yield DMA(in_tensor, weight_tensor, box)
				39
				40
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Yield the high-level commands (DMA and NpuStripe) for `passes[idx]`.

    Generator.  The pass's OFM is cut into stripes according to `strat`:

    * SchedulingStrategy.WeightStream: the OFM is split along its last axis in
      steps of `block_config[-1]`; a single stripe covers the whole range when
      the pass has no weight tensor or its weights do not need a DMA.
    * SchedulingStrategy.IfmStream: the OFM is split along axis -3 (the "y"
      axis in the local naming) in steps of `block_config[0]`.  Before each
      stripe, commands of the preceding pass (`idx - 1`) are pulled from a
      recursive generator and yielded until that pass has produced enough of
      this pass's IFM in y.

    Parameters:
        strat: scheduling strategy, compared against SchedulingStrategy members.
        passes: sequence of passes; `passes[idx]` is the one being generated.
        block_configs: sequence indexed like `passes`.
        idx: index of the pass to generate.

    Side effects: writes "fused_memory_function" and
    "fused_activation_function" into `ps.primary_op.attrs` depending on the
    op types found in `ps.ops`.

    Raises AssertionError for any other `strat` (when asserts are enabled).
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Start with a box covering the entire OFM; narrowed below per concat/stripe
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)

    npu_block_type = ps.npu_block_type

    concat_axis = 0
    concat_offset = 0

    split_offsets = [None, None]  # offset for [ifm, ifm2]

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            # This pass writes only the [concat_start, concat_end) slice of the OFM along concat_axis
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    # The ops list has to be reversed here since the Pass Packing is done in reverse
    ifm_idx = 0
    for op in reversed(ps.ops):
        if op.type == "SplitSliceRead":
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            # NOTE(review): indentation was reconstructed; the increment is assumed to belong to this
            # `if`, so that the n-th SplitSliceRead fills the n-th split offset - confirm against upstream
            ifm_idx += 1

    if strat == SchedulingStrategy.WeightStream:
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not need_dma(weight_tensor):
            # Nothing to stream in per stripe, so issue one stripe spanning the whole last axis
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            # ofm_start/ofm_end are reused and updated in place for every stripe
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # An input with an empty shape gets an empty box instead of a transformed one
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
                )
            else:
                ifm2_box = Box([], [])

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When the concat axis is the last axis of the weight tensor's rank, the stripe range
                # includes the concat offset; remove it to index into the weights
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                # Emit the DMA for this stripe's weights (if they need one) ahead of the stripe itself
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                # The next two arguments occupy the positions used for is_first_h_stripe and
                # is_last_h_stripe in the IfmStream call below
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # The IFM is produced by the previous pass: nothing is present yet, and its commands
            # are generated lazily and interleaved with this pass's stripes
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM counts as already present
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            # (an address of -1 is treated here as "not allocated")
            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Created on the first stripe and then reused, so weights are DMA'd at most once per pass
        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height: from the "ksize" attribute for pooling, otherwise from the weight shape
            k_height = 1
            if npu_block_type == NpuBlockType.Pooling:
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                # Advance the previous pass just far enough to cover the y range this stripe reads;
                # the generator keeps its position, so the next stripe resumes where this one stopped
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                # presumably the ifm2_tensor / ifm2_box slots (passed by keyword in the WeightStream
                # call above) - TODO confirm against NpuStripe's signature
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
				251
				252
				253	def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				254	if strat == SchedulingStrategy.WeightStream:
				255	for idx in range(len(passes)):
				256	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
				257	elif strat == SchedulingStrategy.IfmStream:
				258	yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
				259	else:
				260	assert 0, "Unknown streaming strategy"
				261
				262
				263	def generate_high_level_command_stream_for_cascaded_pass(cps):
				264	yield from generate_high_level_command_stream_for_pass_list(
				265	cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
				266	)
				267
				268
				269	def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
				270	res = []
				271	for cps in sg.cascaded_passes:
				272	if cps.placement == PassPlacement.Npu:
				273	res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
				274
				275	sg.high_level_command_stream = res
				276	if verbose_high_level_command_stream:
				277	sg.print_high_level_command_stream()
				278
				279
				280	def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
				281	highest_ofm_write = 0
				282	if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
				283	return 0
				284
				285	ifm_read = passes[0].ifm_tensor.storage_size
				286	min_overlap = 999999999999999999999
				287	ofm_size = passes[-1].ofm_tensor.storage_size()
				288	if strat == SchedulingStrategy.WeightStream:
				289	return 0
				290	for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
				291	if cmd.is_npu_pass_command():
				292	if cmd.is_first:
				293	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
				294	if ifm_read is None:
				295	return 0
				296	if cmd.is_last:
				297	write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
				298	if write_offset is None:
				299	return 0
				300	highest_ofm_write = max(write_offset, highest_ofm_write)
				301
				302	if cmd.is_first or cmd.is_last:
				303	overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
				304	can_overwrite = ofm_size - overlap_required
				305	min_overlap = min(min_overlap, can_overwrite)
				306
				307	if cmd.is_first:
				308	ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
				309
				310	min_overlap = max(min_overlap, 0)
				311	return min_overlap
				312
				313
				314	def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
				315	return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])