Blame - ethosu/vela/cascade_builder.py - ml/ethos-u/ethos-u-vela

blob: 94c856f4820b35ee3b2290d8ca28b54a72c34df6 [file] [log] [blame]

Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	1	# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Groups Operators in a schedule together to form Cascades.
Johan Alfvén	255dad7	2022-07-16 18:27:05 +0200	[diff] [blame]	19	from collections import namedtuple
				20
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	21	from .numeric_util import round_up
				22	from .operation import NpuBlockType
erik.andersson@arm.com	6b2a0b4	2022-03-22 15:35:30 +0100	[diff] [blame]	23	from .operation import Op
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	24	from .shape4d import Shape4D
				25
				26	non_cascadable_blocks = (
				27	NpuBlockType.Default,
				28	NpuBlockType.VectorProduct,
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	29	NpuBlockType.ReduceSum,
				30	)
				31
				32
				33	class CascadeInfo:
				34	"""Contains metadata about a cascade"""
				35
				36	def __init__(self, start, end, buffers, mem_usage: int):
				37	self.start = start
				38	self.end = end
				39	self.buffers = buffers
				40	self.mem_usage = mem_usage
				41
				42
				43	class BufferMap:
				44	"""Caches the buffers seen"""
				45
				46	def __init__(self):
				47	self.buffer_map = {}
				48
				49	def get_buffer(self, producer, consumer, cost):
				50	assert producer or consumer
				51	key = (producer, consumer)
				52	if key not in self.buffer_map:
				53	# No cached buffer between these two SchedulerOperations
				54	if consumer is None:
				55	# There are either no consumers or multiple consumers - FeatureMap needs to be stored in full
				56	buffer_shape = producer.ofm.shape
				57	buffer_size = producer.ofm_size_in_bytes()
				58	elif producer is None:
				59	# First Op in subgraph or cascade - FeatureMap needs to be stored in full
				60	buffer_shape = consumer.ifm.shape
				61	buffer_size = consumer.ifm_size_in_bytes()
				62	elif producer.requires_full_ofm or consumer.requires_full_ifm:
				63	# FeatureMap needs to be stored in full
				64	buffer_shape = max(producer.ofm.shape, consumer.ifm.shape)
				65	buffer_size = max(producer.ofm_size_in_bytes(), consumer.ifm_size_in_bytes())
				66	else:
				67	# Use a rolling buffer
				68	buffer_shape = rolling_buffer_shape(cost[producer].stripe, cost[consumer].stripe_input)
				69	buffer_size = buffer_shape.elements() * producer.ofm.dtype.size_in_bytes()
				70
				71	self.buffer_map[key] = (buffer_shape, buffer_size)
				72
				73	return self.buffer_map[key]
				74
				75
				76	def rolling_buffer_shape(producer_stripe: Shape4D, consumer_stripe_input: Shape4D) -> Shape4D:
				77	"""Calculates the storage shape of the rolling buffer between two SchedulerOperations in a Cascade"""
				78	buffer_height = round_up(producer_stripe.height + consumer_stripe_input.height, consumer_stripe_input.height)
				79	# Rolling buffers have to conform to NHCWB16 format
				80	return consumer_stripe_input.with_height(buffer_height).with_depth(round_up(producer_stripe.depth, 16))
				81
				82
				83	class CascadeBuilder:
				84	"""Class for grouping SchedulerOperations into cascades"""
				85
				86	def __init__(self, sched_ops, spilling, non_local_mem_usage=None):
				87	self.sched_ops = sched_ops
				88	self.no_cascade = 0
				89	self.non_local_mem_usage = non_local_mem_usage if non_local_mem_usage else {}
				90	self.spilling = spilling
				91
				92	def _is_cascadable(self, sched_op, cost) -> bool:
				93	"""Checks if 'sched_op' can be cascaded"""
erik.andersson@arm.com	6b2a0b4	2022-03-22 15:35:30 +0100	[diff] [blame]	94
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	95	return (
				96	sched_op.op_type.npu_block_type not in non_cascadable_blocks
				97	and cost.stripe.height < sched_op.ofm.shape.height
Johan Alfvén	ab677b3	2022-05-09 13:02:24 +0200	[diff] [blame]	98	and sched_op.parent_op.read_offsets[0] is None
				99	and sched_op.parent_op.read_offsets[1] is None
erik.andersson@arm.com	6b2a0b4	2022-03-22 15:35:30 +0100	[diff] [blame]	100	and self.element_wise_cascading_conformity(sched_op)
Johan Alfvén	dc7414a	2022-08-18 11:12:40 +0200	[diff] [blame^]	101	and not sched_op.parent_op.type.is_resize_op()
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	102	)
				103
Johan Alfvén	255dad7	2022-07-16 18:27:05 +0200	[diff] [blame]	104	def _is_mergeable(self, sched_op) -> bool:
				105	# Code based on merge_elementwise_op_ranges from live_range.py
				106
				107	if not sched_op.op_type.is_elementwise_op():
				108	return False
				109
				110	elem_op = sched_op.parent_op
				111
				112	# Check if overwriting the inputs can be allowed
				113	OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
				114	outp = OpShapeTens(elem_op.ofm_shapes[0], elem_op.ofm)
				115
				116	# check output tensor only has one producer
				117	if len(outp.tens.ops) != 1:
				118	return False
				119
				120	inps = []
				121	if elem_op.ifm is not None:
				122	inps.append(OpShapeTens(elem_op.ifm_shapes[0], elem_op.ifm))
				123	if elem_op.ifm2 is not None:
				124	inps.append(OpShapeTens(elem_op.ifm_shapes[1], elem_op.ifm2))
				125
				126	# find an input tensor that can be overwritten by the output
				127	for inp in inps:
				128	if (
				129	# check op input and output shapes allow overlapping
				130	inp.op_shape == outp.op_shape
				131	# check input tensor is valid
				132	and inp.tens is not None
				133	and inp.tens.shape != []
				134	# check input and output tensors are compatible
				135	and inp.tens.format == outp.tens.format
				136	and inp.tens.dtype == outp.tens.dtype
				137	# check input tensor only has one consumer
				138	and len(inp.tens.consumer_list) == 1
				139	):
				140	return True
				141
				142	return False
				143
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	144	def _estimate_sram_usage(self, sched_op, cost) -> int:
				145	"""Estimate the SRAM required for the Op if all FeatureMaps are in SRAM"""
				146	ifm2_size = sched_op.ifm2_size_in_bytes()
				147	if sched_op.requires_full_ifm:
				148	ifm_size = sched_op.ifm_size_in_bytes()
				149	else:
				150	ifm_size = (
				151	cost.stripe_input.with_depth(round_up(cost.stripe_input.depth, 16)).elements()
				152	* sched_op.ifm.dtype.size_in_bytes()
				153	)
				154	if sched_op.requires_full_ofm:
				155	ofm_size = sched_op.ofm_size_in_bytes()
				156	else:
				157	ofm_size = (
				158	cost.stripe.with_depth(round_up(cost.stripe.depth, 16)).elements() * sched_op.ofm.dtype.size_in_bytes()
				159	)
				160
Johan Alfvén	255dad7	2022-07-16 18:27:05 +0200	[diff] [blame]	161	if self._is_mergeable(sched_op):
				162	# ofm will use the ifm buffer to reduce SRAM usage, hence ofm_size = 0
				163	ofm_size = 0
				164
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	165	return ifm_size + ifm2_size + ofm_size + self.non_local_mem_usage.get(sched_op, 0)
				166
erik.andersson@arm.com	6b2a0b4	2022-03-22 15:35:30 +0100	[diff] [blame]	167	@staticmethod
				168	def element_wise_cascading_conformity(sched_op):
				169	"""Check the inputs of the op to see if it's a candidate for cascading."""
				170	# Cascading sub-operators of Softmax results in a crash when handling Sub and RescaleAdd ops
				171
				172	ifm = sched_op.parent_op.ifm
				173	ifm2 = sched_op.parent_op.ifm2
				174
				175	if sched_op.op_type in [Op.RescaleAdd]:
				176	return False
				177
				178	if sched_op.parent_op.type.is_binary_elementwise_op() and ifm and ifm2:
				179	# We cannot rule out cascadability if at least one IFM is constant
				180	return Op.Const in (ifm.ops[0], ifm2.ops[0])
				181	else:
				182	# Either one IFM is not variable or it is not a binary elementwise op - we cannot rule out cascadability
				183	return True
				184
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	185	def build_cascades(self, ref_schedule, fallback_schedule, guiding_mem_limit):
				186	ref_cost = ref_schedule.cost_map
				187	fallback_cost = fallback_schedule.cost_map
				188	cost = {}
				189	cascade_map = {}
				190	buffers = BufferMap()
				191
				192	# Peak memory usage so far - updated continously, unless dedicated SRAM where this is a hard limit
				193	peak_sram_usage = guiding_mem_limit
				194
				195	idx = 0
				196	while idx < len(self.sched_ops):
				197	op = self.sched_ops[idx]
				198	if op in cost:
				199	# Already processed this Op
				200	idx += 1
				201	continue
				202
				203	if not self._is_cascadable(op, ref_cost[op]):
				204	# Op is not a candidate for cascading - assign fallback cost
				205	cost[op] = fallback_cost[op]
				206	if not self.spilling:
				207	peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
				208	idx += 1
				209	continue
				210
				211	# Propose a cascade starting with this Op
				212	cascade_start = op.index
				213	# Keep track of which Ops are in the proposed cascade as well as the best cascade so far
				214	ops_in_cascade = [op]
				215	ops_in_best_cascade = [op]
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	216	# Get the size of the weight buffer(s)
				217	weight_buffer = sum(tens.storage_size() for tens in ref_cost[op].buffered_weight_tensors)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	218
				219	# The first IFM needs to be stored in full
				220	cascade_ifm_size = op.ifm_size_in_bytes() if not self.spilling else 0
				221
				222	# Add non-local memory usage
				223	cascade_ifm_size += self.non_local_mem_usage.get(op, 0)
				224
				225	# Sum of all intermediate cascade buffers (including weight buffers)
				226	cascade_buffers = weight_buffer
				227	# Best cascade size - Initially it's the fallback cost of the first Op in the cascade
				228	best_cascade_size = self._estimate_sram_usage(op, fallback_cost[op])
				229
				230	# Op is the producer of the OFM consumed by the next Op to consider
				231	producer = op
				232	while True:
				233	dependants = producer.get_dependants()
				234	if len(dependants) != 1:
				235	# producer is either the last Op in the schedule or the start of a branch
				236	break
				237
				238	current_op = dependants[0]
				239	if (
				240	current_op in cost
				241	or current_op not in ref_cost
				242	or not self._is_cascadable(current_op, ref_cost[current_op])
				243	or producer.ofm.shape != current_op.ifm.shape
Louis Verhaard	04bd3e9	2021-08-19 16:36:32 +0200	[diff] [blame]	244	or current_op.requires_full_ifm
				245	or producer.requires_full_ofm
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	246	):
				247	# Current op has already been processed or cannot be cascaded
				248	break
				249
Louis Verhaard	37ba98c	2022-03-16 09:56:45 +0100	[diff] [blame]	250	if producer.index + 1 != current_op.index:
				251	# Cascading is possible, but requires reordering of operations in the schedule,
				252	# this is currently not supported
				253	break
				254
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	255	# Get the size of the FeatureMap buffers between current and neighbouring Ops
				256	op_full_ifm = current_op.ifm_size_in_bytes()
				257	op_full_ofm = current_op.ofm_size_in_bytes()
				258	_, op_ifm_buffer = buffers.get_buffer(producer, current_op, ref_cost)
				259
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	260	# Get the size of the weight buffer(s)
				261	op_weight_buffer = sum(tens.storage_size() for tens in ref_cost[current_op].buffered_weight_tensors)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	262
				263	# Calculate the uncascaded memory requirement for current Op
				264	uncascaded_sram_usage = op_full_ifm + op_full_ofm + self.non_local_mem_usage.get(current_op, 0)
				265
				266	# Add current Op to cascade
				267	ops_in_cascade.append(current_op)
				268
				269	# Increase the accumulated intermediate buffers in the cascade
				270	cascade_buffers += op_ifm_buffer + op_weight_buffer
				271
				272	if self.spilling:
				273	# For Dedicated SRAM only the intermediate buffers are in SRAM
				274	if uncascaded_sram_usage < peak_sram_usage or cascade_buffers > peak_sram_usage:
				275	# Cascade until an Op fits in its entirety or the accumulated buffers no longer fit
				276	break
				277	else:
				278	# Any addition to the cascade that fits is the new best cascade for Dedicated SRAM
				279	ops_in_best_cascade = [op for op in ops_in_cascade]
				280	best_cascade_size = cascade_buffers
				281
				282	else:
				283	# Calculate the total size of the current cascade
				284	cascade_size = cascade_ifm_size + cascade_buffers + op_full_ofm
				285
				286	# Determine if cascading search should stop
				287	if (
				288	uncascaded_sram_usage < peak_sram_usage
				289	and best_cascade_size < peak_sram_usage
				290	or (cascade_ifm_size + cascade_buffers) > best_cascade_size
				291	):
				292	# Both the existing cascade and current Op fits
				293	break
				294
Johan Alfvén	255dad7	2022-07-16 18:27:05 +0200	[diff] [blame]	295	"""
				296	One of two conditions will update the best cascade:
				297
				298	- cascade_size < best_cascade_size or
				299	- cascade_size < uncascaded_sram_usage
				300
				301	The last condition is illustrated below, showing an example where it is
				302	better to choose a larger cascade_size (with more OPs) because it will
				303	use less total SRAM usage.
				304
				305	For simplicity, all featuremaps have same size.
				306
				307	Cascade OP1-OP2, OP3 is standalone
				308
				309	-> \|OP1\| -> roll buffer -> \|OP2\| -> FM -> \|OP3\| -> FM
				310	/
				311	\|OP0\| -> FM
				312	\
				313	-> ....
				314
				315
				316	best_cascade_size : FM + roll buffer + FM
				317	uncascaded_sram_usage: FM + FM + FM
				318
				319	compared with:
				320
				321	Cascade OP1-OP3
				322
				323	-> \|OP1\| -> roll buffer -> \|OP2\| -> roll buffer -> \|OP3\| -> FM
				324	/
				325	\|OP0\| -> FM
				326	\
				327	-> ....
				328
				329
				330	cascade_size : FM + roll buffer + roll buffer + FM
				331
				332
				333	So, for this use case the comparison will be
				334
				335	(FM + roll buffer + roll buffer + FM) < (FM + roll buffer + FM) or
				336	(FM + roll buffer + roll buffer + FM) < (FM + FM + FM)
				337
				338	hence, better to choose Cascade OP1-OP3 in this case.
				339	"""
				340	if cascade_size < best_cascade_size or cascade_size < uncascaded_sram_usage:
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	341	best_cascade_size = cascade_ifm_size + cascade_buffers + op_full_ofm
				342	ops_in_best_cascade = [op for op in ops_in_cascade]
				343
				344	producer = current_op
				345
				346	if len(ops_in_best_cascade) > 1:
				347	# A cascade was created - assign cascade and ref_cost to all of the Ops
				348	cascade_end = cascade_start + (len(ops_in_best_cascade) - 1)
				349	buffers_in_cascade = {}
				350	prev_op = None
				351	for cascaded_op in ops_in_best_cascade:
Louis Verhaard	37ba98c	2022-03-16 09:56:45 +0100	[diff] [blame]	352	assert cascade_start <= cascaded_op.index <= cascade_end
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	353	cost[cascaded_op] = ref_cost[cascaded_op]
				354	cost[cascaded_op].cascade = cascade_end
				355	if prev_op:
				356	rolling_buffer_shape, _ = buffers.get_buffer(prev_op, cascaded_op, ref_cost)
				357	buffers_in_cascade[cascaded_op] = rolling_buffer_shape
				358
				359	prev_op = cascaded_op
				360
				361	# Create a CascadeInfo for the cascade
				362	cascade_map[cascade_end] = CascadeInfo(
				363	cascade_start, cascade_end, buffers_in_cascade, best_cascade_size
				364	)
				365	if not self.spilling:
				366	# Update peak memory usage
				367	peak_sram_usage = max(best_cascade_size, peak_sram_usage)
				368	else:
				369	# Assign fallback cost to the initial Op
				370	cost[op] = fallback_cost[op]
				371	if not self.spilling:
				372	peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
				373
erik.andersson@arm.com	6b2a0b4	2022-03-22 15:35:30 +0100	[diff] [blame]	374	# Update costing and cascade information for the ref_schedule
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	375	ref_schedule.cost_map = cost
				376	ref_schedule.cascades = cascade_map
				377	return ref_schedule