# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# This module is also used during scheduling to work out the allowable IFM/OFM overlap; that
# functionality is accessed via calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
from .high_level_command_stream import Box
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .numeric_util import round_up_divide
from .operation import NpuBlockType
from .tensor import TensorPurpose


def dma_if_necessary(ps, box, tensor):
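    """Yield a DMA command for the tensor if it needs to be copied in.

    The source is the input of the tensor's producing (DMA) op; `box` is the
    region to transfer.
    """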
if tensor.needs_dma():
dma_op = tensor.ops[0]
in_tensor = dma_op.inputs[0]
yield DMA(ps, in_tensor, tensor, box)


def match_tensor(source, derived):
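    """Return True if `derived` is `source` itself or a SplitSliceRead of it."""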
if source == derived:
return True
ops = derived.ops
    return len(ops) == 1 and ops[0].type == "SplitSliceRead" and source == ops[0].inputs[0]


def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
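    """Yield the high-level commands (DMA and NpuStripe) for passes[idx].

    With SchedulingStrategy.WeightStream the OFM is split along the depth
    (last) axis; with SchedulingStrategy.IfmStream it is split along the
    height axis, recursively generating stripes of the previous pass in the
    cascade to produce the IFM rows each stripe consumes.
    """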
is_first = idx == 0
is_last = idx == len(passes) - 1
ps = passes[idx]
block_config = block_configs[idx]
npu_block_type = ps.npu_block_type
split_offsets = [None, None] # offset for [ifm, ifm2]
if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
# Ensure correct ifm and ifm2 order
if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
for op in ps.ops:
if op.type == "SplitSliceRead":
ps.primary_op.attrs["fused_memory_function"] = op.type
assert len(op.inputs) == 1
if match_tensor(ps.ifm_tensor, op.inputs[0]):
split_offsets[0] = op.attrs["split_start"]
elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
split_offsets[1] = op.attrs["split_start"]
else:
                    assert False, "SplitSliceRead input matched neither ifm nor ifm2"
else:
ifm_idx = 0
for op in ps.ops:
if op.type == "SplitSliceRead":
assert ifm_idx < 2
split_offsets[ifm_idx] = op.attrs["split_start"]
ps.primary_op.attrs["fused_memory_function"] = op.type
ifm_idx += 1
ifm_tensor = ps.ifm_tensor
ifm2_tensor = ps.ifm2_tensor
ofm_tensor = ps.ofm_tensor
weight_tensor = ps.weight_tensor
scale_tensor = ps.scale_tensor
ofm_start = [0] * len(ofm_tensor.shape)
ofm_end = list(ofm_tensor.shape)
strides = None
skirt = None
upscaling = 1
if ps.primary_op is not None:
strides = ps.primary_op.attrs.get("strides", None)
skirt = ps.primary_op.attrs.get("skirt", None)
if ps.primary_op.type == "Conv2DBackpropInputSwitchedBias":
upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
elif ps.primary_op.type == "ResizeBilinear":
upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])
concat_axis = 0
concat_offset = 0
# Fusable activation functions
    activation_ops = {"Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"}
for op in ps.ops:
if op.type == "ConcatSliceWrite":
concat_axis = op.attrs["concat_axis"]
concat_start = op.attrs["concat_start"]
concat_end = op.attrs["concat_end"]
ofm_start[concat_axis] = concat_start
ofm_end[concat_axis] = concat_end
concat_offset = concat_start
ps.primary_op.attrs["fused_memory_function"] = op.type
elif op.type in activation_ops:
ps.primary_op.attrs["fused_activation_function"] = op.type
if strat == SchedulingStrategy.WeightStream:
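        # Weight streaming: step through the OFM depth in block-config-sized
        # slices so that each slice's weights can be DMA'd in; if the weights
        # do not need DMA, a single full-depth stripe is issued instead.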
ofm_step = block_config[-1]
ofm_stop = ofm_end[-1]
if weight_tensor is None or not weight_tensor.needs_dma():
ofm_step = ofm_stop
for start in range(ofm_start[-1], ofm_stop, ofm_step):
end = min(start + ofm_step, ofm_stop)
ofm_start[-1] = start
ofm_end[-1] = end
ofm_box = Box(ofm_start, ofm_end)
ifm_box = None
ifm2_box = None
if ifm_tensor.shape != []:
ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
else:
ifm_box = Box([], [])
if ifm2_tensor is not None and ifm2_tensor.shape != []:
ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm2_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[1],
upscaling,
)
else:
ifm2_box = Box([], [])
for intermediate in ps.intermediates:
if (
intermediate is not None
and intermediate.shape != []
and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
):
if intermediate.purpose is TensorPurpose.FeatureMap:
intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
intermediate.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
else:
intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
yield from dma_if_necessary(ps, intermediate_box, intermediate)
weight_box = None
if weight_tensor is not None:
weight_oc_start = start
weight_oc_end = end
if concat_axis - len(weight_tensor.shape) == -1:
weight_oc_start -= concat_offset
weight_oc_end -= concat_offset
weight_box = Box.make_weight_box(
weight_tensor.shape,
npu_block_type,
weight_oc_start,
weight_oc_end,
weight_tensor.weight_transpose_depthwise,
)
yield from dma_if_necessary(ps, weight_box, weight_tensor)
yield NpuStripe(
ps,
block_config,
is_first,
is_last,
True,
True,
ifm_tensor,
ifm_box,
ofm_tensor,
ofm_box,
weight_tensor,
weight_box,
scale_tensor,
concat_axis,
concat_offset,
ifm2_tensor=ifm2_tensor,
ifm2_box=ifm2_box,
)
elif strat == SchedulingStrategy.IfmStream:
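        # IFM streaming: step through the OFM height one block row at a time,
        # pulling stripes from the preceding pass on demand to supply the IFM.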
y_step = block_config[0]
y_start = 0
y_dim = 1
if len(ofm_tensor.shape) >= 3:
y_start = ofm_start[-3]
y_dim = ofm_end[-3]
if idx > 0:
ifm_y_present = 0
prev_pass = passes[idx - 1]
prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
else:
ifm_y_present = 1
if len(ifm_tensor.shape) >= 3:
ifm_y_present = ifm_tensor.shape[-3]
prev_pass_gen = []
prev_pass = None
if len(passes) == 1:
# no cascading, can just issue one big stripe
# but only if we've done allocation and OFM does not overlap IFM
if ifm_tensor.address is not None and ofm_tensor.address is not None:
if (
ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
):
y_step = y_dim
weight_box = None
for start in range(y_start, y_dim, y_step):
end = min(start + y_step, y_dim)
if len(ofm_tensor.shape) >= 3:
ofm_start[-3] = start
ofm_end[-3] = end
ofm_box = Box(ofm_start, ofm_end)
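            # The kernel height is needed to derive the IFM skirt: pooling and
            # reduce ops carry it in their "ksize" attribute, convolutions in
            # the weight tensor shape.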
k_height = 1
if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
if ps.primary_op is not None:
k_height = ps.primary_op.attrs["ksize"][1]
else:
if weight_tensor is not None:
k_height = weight_tensor.shape[0]
ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
k_height,
upscaling,
)
for intermediate in ps.intermediates:
if (
intermediate is not None
and intermediate.shape != []
and intermediate.purpose == TensorPurpose.FeatureMap
):
intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
intermediate.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
yield from dma_if_necessary(ps, intermediate_box, intermediate)
ifm_y_needed = 1
if len(ifm_box.end_coord) >= 3:
ifm_y_needed = ifm_box.end_coord[-3]
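            # If the preceding pass has not yet produced all the IFM rows this
            # stripe reads, pull stripes from its generator until it has.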
if ifm_y_present < ifm_y_needed:
for prev_cmd in prev_pass_gen:
yield prev_cmd
rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
if rng is not None:
ifm_y_present = max(ifm_y_present, rng[1])
if ifm_y_present >= ifm_y_needed:
break
if weight_tensor is not None and weight_box is None:
weight_box = Box.make_weight_box(
weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
)
yield from dma_if_necessary(ps, weight_box, weight_tensor)
# Check if first/last stripe in pass
is_first_h_stripe = start == y_start
is_last_h_stripe = (start + y_step) >= y_dim
stripe = NpuStripe(
ps,
block_config,
is_first,
is_last,
is_first_h_stripe,
is_last_h_stripe,
ifm_tensor,
ifm_box,
ofm_tensor,
ofm_box,
weight_tensor,
weight_box,
scale_tensor,
concat_axis,
concat_offset,
None,
None,
pad_top,
pad_bottom,
)
yield stripe
else:
assert 0, "unknown scheduling strategy"


def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
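    """Yield the high-level commands for a list of cascaded passes.

    Weight streaming generates each pass in turn; IFM streaming starts from
    the last pass, which recursively pulls in stripes from its predecessors.
    """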
if strat == SchedulingStrategy.WeightStream:
for idx in range(len(passes)):
yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
elif strat == SchedulingStrategy.IfmStream:
yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
else:
assert 0, "Unknown streaming strategy"


def generate_high_level_command_stream_for_cascaded_pass(cps):
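    """Yield the high-level command stream for a single CascadedPass."""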
yield from generate_high_level_command_stream_for_pass_list(
cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
)


def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
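    """Generate and attach the high-level command stream for subgraph sg.

    Only cascaded passes placed on the NPU contribute commands.
    """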
res = []
for cps in sg.cascaded_passes:
if cps.placement == PassPlacement.Npu:
res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
sg.high_level_command_stream = res
if verbose_high_level_command_stream:
sg.print_high_level_command_stream()


def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
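    """Return how many bytes of the OFM may safely overlap the IFM in memory.

    Walks the generated command stream and, at each first/last stripe,
    compares the highest OFM write offset against the lowest IFM read offset
    still needed; the smallest margin seen is the allowed overlap. Returns 0
    whenever the overlap cannot be determined.
    """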
highest_ofm_write = 0
if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
return 0
    ifm_read = passes[0].ifm_tensor.storage_size()
min_overlap = 999999999999999999999
ofm_size = passes[-1].ofm_tensor.storage_size()
if strat == SchedulingStrategy.WeightStream:
return 0
for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
if cmd.is_npu_pass_command():
if cmd.is_first:
ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
if ifm_read is None:
return 0
if cmd.is_last:
write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
if write_offset is None:
return 0
highest_ofm_write = max(write_offset, highest_ofm_write)
if cmd.is_first or cmd.is_last:
overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
can_overwrite = ofm_size - overlap_required
min_overlap = min(min_overlap, can_overwrite)
if cmd.is_first:
ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
min_overlap = max(min_overlap, 0)
return min_overlap


def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
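    """Return the allowed OFM/IFM overlap in bytes for a CascadedPass."""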
return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])