Blame - ethosu/vela/cascade_builder.py - ml/ethos-u/ethos-u-vela

blob: e7105e2caf1e8920f113d45e289e637ac3ff7da0 [file] [log] [blame]

Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	1	# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
				16	#
				17	# Description:
				18	# Groups Operators in a schedule together to form Cascades.
				19	from .numeric_util import round_up
				20	from .operation import NpuBlockType
				21	from .shape4d import Shape4D
				22
# NPU block types that are never treated as cascade candidates; an Op whose
# op_type.npu_block_type is listed here is rejected by CascadeBuilder._is_cascadable
non_cascadable_blocks = (
    NpuBlockType.Default,
    NpuBlockType.VectorProduct,
    NpuBlockType.ElementWise,
    NpuBlockType.ReduceSum,
)
				29
				30
				31	class CascadeInfo:
				32	"""Contains metadata about a cascade"""
				33
				34	def __init__(self, start, end, buffers, mem_usage: int):
				35	self.start = start
				36	self.end = end
				37	self.buffers = buffers
				38	self.mem_usage = mem_usage
				39
				40
				41	class BufferMap:
				42	"""Caches the buffers seen"""
				43
				44	def __init__(self):
				45	self.buffer_map = {}
				46
				47	def get_buffer(self, producer, consumer, cost):
				48	assert producer or consumer
				49	key = (producer, consumer)
				50	if key not in self.buffer_map:
				51	# No cached buffer between these two SchedulerOperations
				52	if consumer is None:
				53	# There are either no consumers or multiple consumers - FeatureMap needs to be stored in full
				54	buffer_shape = producer.ofm.shape
				55	buffer_size = producer.ofm_size_in_bytes()
				56	elif producer is None:
				57	# First Op in subgraph or cascade - FeatureMap needs to be stored in full
				58	buffer_shape = consumer.ifm.shape
				59	buffer_size = consumer.ifm_size_in_bytes()
				60	elif producer.requires_full_ofm or consumer.requires_full_ifm:
				61	# FeatureMap needs to be stored in full
				62	buffer_shape = max(producer.ofm.shape, consumer.ifm.shape)
				63	buffer_size = max(producer.ofm_size_in_bytes(), consumer.ifm_size_in_bytes())
				64	else:
				65	# Use a rolling buffer
				66	buffer_shape = rolling_buffer_shape(cost[producer].stripe, cost[consumer].stripe_input)
				67	buffer_size = buffer_shape.elements() * producer.ofm.dtype.size_in_bytes()
				68
				69	self.buffer_map[key] = (buffer_shape, buffer_size)
				70
				71	return self.buffer_map[key]
				72
				73
				74	def rolling_buffer_shape(producer_stripe: Shape4D, consumer_stripe_input: Shape4D) -> Shape4D:
				75	"""Calculates the storage shape of the rolling buffer between two SchedulerOperations in a Cascade"""
				76	buffer_height = round_up(producer_stripe.height + consumer_stripe_input.height, consumer_stripe_input.height)
				77	# Rolling buffers have to conform to NHCWB16 format
				78	return consumer_stripe_input.with_height(buffer_height).with_depth(round_up(producer_stripe.depth, 16))
				79
				80
				81	class CascadeBuilder:
				82	"""Class for grouping SchedulerOperations into cascades"""
				83
				84	def __init__(self, sched_ops, spilling, non_local_mem_usage=None):
				85	self.sched_ops = sched_ops
				86	self.no_cascade = 0
				87	self.non_local_mem_usage = non_local_mem_usage if non_local_mem_usage else {}
				88	self.spilling = spilling
				89
				90	def _is_cascadable(self, sched_op, cost) -> bool:
				91	"""Checks if 'sched_op' can be cascaded"""
				92	return (
				93	sched_op.op_type.npu_block_type not in non_cascadable_blocks
				94	and cost.stripe.height < sched_op.ofm.shape.height
Johan Alfvén	ab677b3	2022-05-09 13:02:24 +0200	[diff] [blame]	95	and sched_op.parent_op.read_offsets[0] is None
				96	and sched_op.parent_op.read_offsets[1] is None
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	97	)
				98
				99	def _estimate_sram_usage(self, sched_op, cost) -> int:
				100	"""Estimate the SRAM required for the Op if all FeatureMaps are in SRAM"""
				101	ifm2_size = sched_op.ifm2_size_in_bytes()
				102	if sched_op.requires_full_ifm:
				103	ifm_size = sched_op.ifm_size_in_bytes()
				104	else:
				105	ifm_size = (
				106	cost.stripe_input.with_depth(round_up(cost.stripe_input.depth, 16)).elements()
				107	* sched_op.ifm.dtype.size_in_bytes()
				108	)
				109	if sched_op.requires_full_ofm:
				110	ofm_size = sched_op.ofm_size_in_bytes()
				111	else:
				112	ofm_size = (
				113	cost.stripe.with_depth(round_up(cost.stripe.depth, 16)).elements() * sched_op.ofm.dtype.size_in_bytes()
				114	)
				115
				116	return ifm_size + ifm2_size + ofm_size + self.non_local_mem_usage.get(sched_op, 0)
				117
				118	def build_cascades(self, ref_schedule, fallback_schedule, guiding_mem_limit):
				119	ref_cost = ref_schedule.cost_map
				120	fallback_cost = fallback_schedule.cost_map
				121	cost = {}
				122	cascade_map = {}
				123	buffers = BufferMap()
				124
				125	# Peak memory usage so far - updated continously, unless dedicated SRAM where this is a hard limit
				126	peak_sram_usage = guiding_mem_limit
				127
				128	idx = 0
				129	while idx < len(self.sched_ops):
				130	op = self.sched_ops[idx]
				131	if op in cost:
				132	# Already processed this Op
				133	idx += 1
				134	continue
				135
				136	if not self._is_cascadable(op, ref_cost[op]):
				137	# Op is not a candidate for cascading - assign fallback cost
				138	cost[op] = fallback_cost[op]
				139	if not self.spilling:
				140	peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
				141	idx += 1
				142	continue
				143
				144	# Propose a cascade starting with this Op
				145	cascade_start = op.index
				146	# Keep track of which Ops are in the proposed cascade as well as the best cascade so far
				147	ops_in_cascade = [op]
				148	ops_in_best_cascade = [op]
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	149	# Get the size of the weight buffer(s)
				150	weight_buffer = sum(tens.storage_size() for tens in ref_cost[op].buffered_weight_tensors)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	151
				152	# The first IFM needs to be stored in full
				153	cascade_ifm_size = op.ifm_size_in_bytes() if not self.spilling else 0
				154
				155	# Add non-local memory usage
				156	cascade_ifm_size += self.non_local_mem_usage.get(op, 0)
				157
				158	# Sum of all intermediate cascade buffers (including weight buffers)
				159	cascade_buffers = weight_buffer
				160	# Best cascade size - Initially it's the fallback cost of the first Op in the cascade
				161	best_cascade_size = self._estimate_sram_usage(op, fallback_cost[op])
				162
				163	# Op is the producer of the OFM consumed by the next Op to consider
				164	producer = op
				165	while True:
				166	dependants = producer.get_dependants()
				167	if len(dependants) != 1:
				168	# producer is either the last Op in the schedule or the start of a branch
				169	break
				170
				171	current_op = dependants[0]
				172	if (
				173	current_op in cost
				174	or current_op not in ref_cost
				175	or not self._is_cascadable(current_op, ref_cost[current_op])
				176	or producer.ofm.shape != current_op.ifm.shape
Louis Verhaard	04bd3e9	2021-08-19 16:36:32 +0200	[diff] [blame]	177	or current_op.requires_full_ifm
				178	or producer.requires_full_ofm
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	179	):
				180	# Current op has already been processed or cannot be cascaded
				181	break
				182
Louis Verhaard	37ba98c	2022-03-16 09:56:45 +0100	[diff] [blame]	183	if producer.index + 1 != current_op.index:
				184	# Cascading is possible, but requires reordering of operations in the schedule,
				185	# this is currently not supported
				186	break
				187
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	188	# Get the size of the FeatureMap buffers between current and neighbouring Ops
				189	op_full_ifm = current_op.ifm_size_in_bytes()
				190	op_full_ofm = current_op.ofm_size_in_bytes()
				191	_, op_ifm_buffer = buffers.get_buffer(producer, current_op, ref_cost)
				192
Rickard Bolin	fd8b500	2022-05-16 09:11:06 +0000	[diff] [blame]	193	# Get the size of the weight buffer(s)
				194	op_weight_buffer = sum(tens.storage_size() for tens in ref_cost[current_op].buffered_weight_tensors)
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	195
				196	# Calculate the uncascaded memory requirement for current Op
				197	uncascaded_sram_usage = op_full_ifm + op_full_ofm + self.non_local_mem_usage.get(current_op, 0)
				198
				199	# Add current Op to cascade
				200	ops_in_cascade.append(current_op)
				201
				202	# Increase the accumulated intermediate buffers in the cascade
				203	cascade_buffers += op_ifm_buffer + op_weight_buffer
				204
				205	if self.spilling:
				206	# For Dedicated SRAM only the intermediate buffers are in SRAM
				207	if uncascaded_sram_usage < peak_sram_usage or cascade_buffers > peak_sram_usage:
				208	# Cascade until an Op fits in its entirety or the accumulated buffers no longer fit
				209	break
				210	else:
				211	# Any addition to the cascade that fits is the new best cascade for Dedicated SRAM
				212	ops_in_best_cascade = [op for op in ops_in_cascade]
				213	best_cascade_size = cascade_buffers
				214
				215	else:
				216	# Calculate the total size of the current cascade
				217	cascade_size = cascade_ifm_size + cascade_buffers + op_full_ofm
				218
				219	# Determine if cascading search should stop
				220	if (
				221	uncascaded_sram_usage < peak_sram_usage
				222	and best_cascade_size < peak_sram_usage
				223	or (cascade_ifm_size + cascade_buffers) > best_cascade_size
				224	):
				225	# Both the existing cascade and current Op fits
				226	break
				227
				228	# Determine if current cascade is the best so far
				229	if cascade_size < best_cascade_size:
				230	best_cascade_size = cascade_ifm_size + cascade_buffers + op_full_ofm
				231	ops_in_best_cascade = [op for op in ops_in_cascade]
				232
				233	producer = current_op
				234
				235	if len(ops_in_best_cascade) > 1:
				236	# A cascade was created - assign cascade and ref_cost to all of the Ops
				237	cascade_end = cascade_start + (len(ops_in_best_cascade) - 1)
				238	buffers_in_cascade = {}
				239	prev_op = None
				240	for cascaded_op in ops_in_best_cascade:
Louis Verhaard	37ba98c	2022-03-16 09:56:45 +0100	[diff] [blame]	241	assert cascade_start <= cascaded_op.index <= cascade_end
Tim Hall	d8339a7	2021-05-27 18:49:40 +0100	[diff] [blame]	242	cost[cascaded_op] = ref_cost[cascaded_op]
				243	cost[cascaded_op].cascade = cascade_end
				244	if prev_op:
				245	rolling_buffer_shape, _ = buffers.get_buffer(prev_op, cascaded_op, ref_cost)
				246	buffers_in_cascade[cascaded_op] = rolling_buffer_shape
				247
				248	prev_op = cascaded_op
				249
				250	# Create a CascadeInfo for the cascade
				251	cascade_map[cascade_end] = CascadeInfo(
				252	cascade_start, cascade_end, buffers_in_cascade, best_cascade_size
				253	)
				254	if not self.spilling:
				255	# Update peak memory usage
				256	peak_sram_usage = max(best_cascade_size, peak_sram_usage)
				257	else:
				258	# Assign fallback cost to the initial Op
				259	cost[op] = fallback_cost[op]
				260	if not self.spilling:
				261	peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
				262
				263	# Update costing and cascde information for the ref_schedule
				264	ref_schedule.cost_map = cost
				265	ref_schedule.cascades = cascade_map
				266	return ref_schedule