blob: 95872cfef40648fe7fca5efbdecefe9255996c86 [file] [log] [blame]
# SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Groups Operators in a schedule together to form Cascades.
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .operation import Padding
from .shape4d import Shape4D
# NPU block types whose ops are never candidates for cascading
# (checked in CascadeBuilder._is_cascadable)
non_cascadable_blocks = (
    NpuBlockType.Default,
    NpuBlockType.VectorProduct,
    NpuBlockType.ReduceSum,
)
class CascadeInfo:
    """Metadata describing one cascade of SchedulerOperations."""
    def __init__(self, start, end, buffers, mem_usage: int):
        # First and last op index covered by the cascade
        self.start, self.end = start, end
        # Buffers used between the ops inside the cascade
        self.buffers = buffers
        # Memory used by the cascade
        self.mem_usage = mem_usage
class BufferMap:
    """Caches the FeatureMap buffer needed between pairs of SchedulerOperations."""
    def __init__(self):
        # (producer, consumer) -> (buffer_shape, buffer_size) for pairs already seen
        self.buffer_map = {}
    def get_buffer(self, producer, consumer, cost):
        """Return (buffer_shape, buffer_size) for the FeatureMap between producer and consumer.

        Either end may be None (no/multiple consumers, or first Op of a subgraph
        or cascade), in which case the FeatureMap is stored in full. Results are
        cached per (producer, consumer) pair.
        """
        assert producer or consumer
        key = (producer, consumer)
        cached = self.buffer_map.get(key)
        if cached is None:
            if consumer is None:
                # No consumer, or several consumers - the FeatureMap is stored in full
                cached = (producer.ofm.shape, producer.ofm_size_in_bytes())
            elif producer is None:
                # First Op in subgraph or cascade - the FeatureMap is stored in full
                cached = (consumer.ifm.shape, consumer.ifm_size_in_bytes())
            elif producer.requires_full_ofm or consumer.requires_full_ifm:
                # One of the two ends needs the FeatureMap stored in full
                cached = (
                    max(producer.ofm.shape, consumer.ifm.shape),
                    max(producer.ofm_size_in_bytes(), consumer.ifm_size_in_bytes()),
                )
            else:
                # Neither end needs the full FeatureMap - use a rolling buffer
                shape = rolling_buffer_shape(cost[producer].stripe, cost[consumer].stripe_input)
                cached = (shape, shape.elements() * producer.ofm.dtype.size_in_bytes())
            self.buffer_map[key] = cached
        return cached
def rolling_buffer_shape(producer_stripe: Shape4D, consumer_stripe_input: Shape4D) -> Shape4D:
    """Calculates the storage shape of the rolling buffer between two SchedulerOperations in a Cascade"""
    # Room for one producer stripe plus one consumer stripe, rounded up to a
    # whole number of consumer stripe heights
    height = round_up(producer_stripe.height + consumer_stripe_input.height, consumer_stripe_input.height)
    # A strided consumer op can read an IFM narrower than the producer's OFM,
    # so the wider of the two is needed
    width = max(producer_stripe.width, consumer_stripe_input.width)
    # NHCWB16 format requires the depth to be a multiple of 16
    depth = round_up(producer_stripe.depth, 16)
    return Shape4D([1, height, width, depth])
class CascadeBuilder:
    """Class for grouping SchedulerOperations into cascades"""
    def __init__(self, sched_ops, spilling, non_local_mem_usage=None):
        # Schedule-ordered SchedulerOperations considered for cascading
        self.sched_ops = sched_ops
        self.no_cascade = 0
        # SchedulerOperation -> bytes of memory in use that does not belong to the
        # op itself; empty dict when not supplied
        self.non_local_mem_usage = non_local_mem_usage if non_local_mem_usage else {}
        # True for Dedicated SRAM behaviour (only intermediate buffers kept in SRAM,
        # peak usage acts as a hard limit); False for Shared SRAM
        self.spilling = spilling
    def _is_cascadable(self, sched_op, cost) -> bool:
        """Checks if 'sched_op' can be cascaded"""
        return (
            # Block type must support striping
            sched_op.op_type.npu_block_type not in non_cascadable_blocks
            # The op must actually be striped (stripe smaller than full OFM height)
            and cost.stripe.height < sched_op.ofm.shape.height
            # Ops that read at an offset into their IFMs cannot be cascaded
            and sched_op.parent_op.read_offsets[0] is None
            and sched_op.parent_op.read_offsets[1] is None
            and self.elementwise_cascadable(sched_op)
            # Special-cased op types that are never cascaded
            and not sched_op.parent_op.type.is_resize_op()
            and not sched_op.parent_op.type == Op.Conv2DBackpropInputSwitchedBias
            and sched_op.parent_op.attrs.get("padding", None) != Padding.TILE
        )
    def _estimate_sram_usage(self, sched_op, cost) -> int:
        """Estimate the SRAM required for the Op if all FeatureMaps are in SRAM"""
        if sched_op.parent_op.type.is_binary_elementwise_op():
            # ifm2 is scalar or constant and will always persist in permanent memory
            ifm2_size = 0
        else:
            ifm2_size = sched_op.ifm2_size_in_bytes()
        if sched_op.requires_full_ifm:
            ifm_size = sched_op.ifm_size_in_bytes()
        else:
            # Only one input stripe resident; depth rounded up to 16 for NHCWB16
            ifm_size = (
                cost.stripe_input.with_depth(round_up(cost.stripe_input.depth, 16)).elements()
                * sched_op.ifm.dtype.size_in_bytes()
            )
        if ofm_can_reuse_ifm(sched_op):
            # ofm will use the ifm buffer to reduce SRAM usage, hence ofm_size = 0
            ofm_size = 0
        elif sched_op.requires_full_ofm:
            ofm_size = sched_op.ofm_size_in_bytes()
        else:
            # Only one output stripe resident; depth rounded up to 16 for NHCWB16
            ofm_size = (
                cost.stripe.with_depth(round_up(cost.stripe.depth, 16)).elements() * sched_op.ofm.dtype.size_in_bytes()
            )
        return ifm_size + ifm2_size + ofm_size + self.non_local_mem_usage.get(sched_op, 0)
    @staticmethod
    def elementwise_cascadable(sched_op):
        """Check if the elementwise can be cascaded."""
        if sched_op.parent_op.type.is_binary_elementwise_op():
            ifm = sched_op.parent_op.ifm
            ifm2 = sched_op.parent_op.ifm2
            ofm = sched_op.parent_op.ofm
            # IFM must be non-constant/non-scalar/non-broadcast
            ifm_cascadable = not (ifm.is_const or ifm.is_scalar or ifm.is_broadcast(ofm))
            # IFM2 must be constant or scalar
            ifm2_cascadable = ifm2.is_const or ifm2.is_scalar
            return ifm_cascadable and ifm2_cascadable
        else:
            # Non-binary-elementwise ops impose no extra restriction here
            return True
    def build_cascades(self, ref_schedule, fallback_schedule, guiding_mem_limit):
        """Group consecutive cascadable Ops of ref_schedule into cascades.

        Ops that cannot be cascaded (or end up in a cascade of one) are assigned
        their cost from fallback_schedule. guiding_mem_limit seeds the running
        peak SRAM usage (a hard limit in the spilling/Dedicated SRAM case).
        Mutates ref_schedule's cost_map and cascades and returns it.
        """
        ref_cost = ref_schedule.cost_map
        fallback_cost = fallback_schedule.cost_map
        cost = {}
        cascade_map = {}
        buffers = BufferMap()
        # Peak memory usage so far - updated continuously, unless dedicated SRAM where this is a hard limit
        peak_sram_usage = guiding_mem_limit
        idx = 0
        while idx < len(self.sched_ops):
            op = self.sched_ops[idx]
            if op in cost:
                # Already processed this Op (note: Ops swallowed into a cascade are
                # skipped here on the following iterations, which is how idx advances
                # past a completed cascade)
                idx += 1
                continue
            if not self._is_cascadable(op, ref_cost[op]):
                # Op is not a candidate for cascading - assign fallback cost
                cost[op] = fallback_cost[op]
                if not self.spilling:
                    peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
                idx += 1
                continue
            # Propose a cascade starting with this Op
            cascade_start = op.index
            # Keep track of which Ops are in the proposed cascade as well as the best cascade so far
            ops_in_cascade = [op]
            ops_in_best_cascade = [op]
            # Get the size of the weight buffer(s)
            weight_buffer = sum(tens.storage_size() for tens in ref_cost[op].buffered_weight_tensors)
            # The first IFM needs to be stored in full
            cascade_ifm_size = op.ifm_size_in_bytes() if not self.spilling else 0
            # Sum of all intermediate cascade buffers (including weight buffers)
            cascade_buffers = weight_buffer
            # Best cascade size - Initially it's the fallback cost of the first Op in the cascade
            best_cascade_size = self._estimate_sram_usage(op, fallback_cost[op])
            # Op is the producer of the OFM consumed by the next Op to consider
            producer = op
            while True:
                dependants = producer.get_dependants()
                if len(dependants) != 1:
                    # producer is either the last Op in the schedule or the start of a branch
                    break
                current_op = dependants[0]
                if (
                    current_op in cost
                    or current_op not in ref_cost
                    or not self._is_cascadable(current_op, ref_cost[current_op])
                    or producer.ofm.shape != current_op.ifm.shape
                    or current_op.requires_full_ifm
                    or producer.requires_full_ofm
                ):
                    # Current op has already been processed or cannot be cascaded
                    break
                if producer.index + 1 != current_op.index:
                    # Cascading is possible, but requires reordering of operations in the schedule,
                    # this is currently not supported
                    break
                # Get the size of the FeatureMap buffers between current and neighbouring Ops
                op_full_ofm = current_op.ofm_size_in_bytes()
                _, op_ifm_buffer = buffers.get_buffer(producer, current_op, ref_cost)
                # Get the size of the weight buffer(s)
                op_weight_buffer = sum(tens.storage_size() for tens in ref_cost[current_op].buffered_weight_tensors)
                # Calculate the uncascaded memory requirement for current Op
                uncascaded_sram_usage = self._estimate_sram_usage(current_op, fallback_cost[current_op])
                # Add current Op to cascade
                ops_in_cascade.append(current_op)
                # Increase the accumulated intermediate buffers in the cascade
                cascade_buffers += op_ifm_buffer + op_weight_buffer
                if self.spilling:
                    # For Dedicated SRAM only the intermediate buffers are in SRAM
                    if uncascaded_sram_usage < peak_sram_usage or cascade_buffers > peak_sram_usage:
                        # Cascade until an Op fits in its entirety or the accumulated buffers no longer fit
                        break
                    else:
                        # Any addition to the cascade that fits is the new best cascade for Dedicated SRAM
                        ops_in_best_cascade = [op for op in ops_in_cascade]
                        best_cascade_size = cascade_buffers
                else:
                    # Calculate the total size of the current cascade including non local mem usage
                    cascade_size = (
                        cascade_ifm_size + cascade_buffers + op_full_ofm + self.non_local_mem_usage.get(op, 0)
                    )
                    # Determine if cascading search should stop
                    if (
                        uncascaded_sram_usage < peak_sram_usage
                        and best_cascade_size < peak_sram_usage
                        or (cascade_ifm_size + cascade_buffers) > best_cascade_size
                    ):
                        # Both the existing cascade and current Op fits or
                        # not possible to reduce cascade size any further
                        break
                    """
                    One of two conditions will update the best cascade:
                    - cascade_size < best_cascade_size or
                    - cascade_size < uncascaded_sram_usage
                    The last condition is illustrated below, showing an example where it is
                    better to choose a larger cascade_size (with more OPs) because it will
                    use less total SRAM usage.
                    For simplicity, all featuremaps have same size.
                    Cascade OP1-OP2, OP3 is standalone
                               -> |OP1| -> roll buffer -> |OP2| -> FM -> |OP3| -> FM
                              /
                    |OP0| -> FM
                              \
                               -> ....
                    best_cascade_size    : FM + roll buffer + FM
                    uncascaded_sram_usage: FM + FM + FM
                    compared with:
                    Cascade OP1-OP3
                               -> |OP1| -> roll buffer -> |OP2| -> roll buffer -> |OP3| -> FM
                              /
                    |OP0| -> FM
                              \
                               -> ....
                    cascade_size: FM + roll buffer + roll buffer + FM
                    So, for this use case the comparison will be
                    (FM + roll buffer + roll buffer + FM) < (FM + roll buffer + FM) or
                    (FM + roll buffer + roll buffer + FM) < (FM + FM + FM)
                    hence, better to choose Cascade OP1-OP3 in this case.
                    """
                    if cascade_size < best_cascade_size or cascade_size < uncascaded_sram_usage:
                        best_cascade_size = cascade_size
                        ops_in_best_cascade = [op for op in ops_in_cascade]
                producer = current_op
            if len(ops_in_best_cascade) > 1:
                # A cascade was created - assign cascade and ref_cost to all of the Ops
                cascade_end = cascade_start + (len(ops_in_best_cascade) - 1)
                buffers_in_cascade = {}
                prev_op = None
                for cascaded_op in ops_in_best_cascade:
                    assert cascade_start <= cascaded_op.index <= cascade_end
                    cost[cascaded_op] = ref_cost[cascaded_op]
                    cost[cascaded_op].cascade = cascade_end
                    if prev_op:
                        rolling_buffer_shape, _ = buffers.get_buffer(prev_op, cascaded_op, ref_cost)
                        buffers_in_cascade[cascaded_op] = rolling_buffer_shape
                    prev_op = cascaded_op
                # Create a CascadeInfo for the cascade, only store the actual size used by
                # the cascade so non local usage is removed. This is done in order to be
                # able to calculate the correct non local usage in the scheduler when
                # optimizing the sub schedules.
                cascade_map[cascade_end] = CascadeInfo(
                    cascade_start,
                    cascade_end,
                    buffers_in_cascade,
                    best_cascade_size - self.non_local_mem_usage.get(op, 0),
                )
                if not self.spilling:
                    # Update peak memory usage
                    peak_sram_usage = max(best_cascade_size, peak_sram_usage)
            else:
                # Assign fallback cost to the initial Op
                cost[op] = fallback_cost[op]
                if not self.spilling:
                    peak_sram_usage = max(self._estimate_sram_usage(op, fallback_cost[op]), peak_sram_usage)
        # Update costing and cascade information for the ref_schedule
        ref_schedule.cost_map = cost
        ref_schedule.cascades = cascade_map
        return ref_schedule