# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# This module is also used during scheduling to work out the allowable IFM/OFM overlap; that
# functionality is accessed via calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
from .high_level_command_stream import Box
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .numeric_util import round_up_divide
from .operation import NpuBlockType
from .tensor import TensorPurpose


def dma_if_necessary(ps, box, tensor):
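    """Yield a DMA command for the tensor if it needs to be copied in.

    The source is the input of the tensor's producing (DMA) op; `box` is the
    region to transfer.
    """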
if tensor.needs_dma():
dma_op = tensor.ops[0]
in_tensor = dma_op.inputs[0]
yield DMA(ps, in_tensor, tensor, box)


def match_tensor(source, derived):
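    """Return True if `derived` is `source` itself or a SplitSliceRead of it."""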
if source == derived:
return True
ops = derived.ops
    return len(ops) == 1 and ops[0].type == "SplitSliceRead" and source == ops[0].inputs[0]


def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
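    """Yield the high-level commands (DMA and NpuStripe) for passes[idx].

    With SchedulingStrategy.WeightStream the OFM is split along the depth
    (last) axis; with SchedulingStrategy.IfmStream it is split along the
    height axis, recursively generating stripes of the previous pass in the
    cascade to produce the IFM rows each stripe consumes.
    """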
is_first = idx == 0
is_last = idx == len(passes) - 1
ps = passes[idx]
block_config = block_configs[idx]
npu_block_type = ps.npu_block_type
split_offsets = [None, None] # offset for [ifm, ifm2]
if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
# Ensure correct ifm and ifm2 order
if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
for op in ps.ops:
if op.type == "SplitSliceRead":
ps.primary_op.attrs["fused_memory_function"] = op.type
assert len(op.inputs) == 1
if match_tensor(ps.ifm_tensor, op.inputs[0]):
split_offsets[0] = op.attrs["split_start"]
elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
split_offsets[1] = op.attrs["split_start"]
else:
                    assert False, "SplitSliceRead input matched neither ifm nor ifm2"
else:
ifm_idx = 0
for op in ps.ops:
if op.type == "SplitSliceRead":
assert ifm_idx < 2
split_offsets[ifm_idx] = op.attrs["split_start"]
ps.primary_op.attrs["fused_memory_function"] = op.type
ifm_idx += 1
ifm_tensor = ps.ifm_tensor
ifm2_tensor = ps.ifm2_tensor
ofm_tensor = ps.ofm_tensor
weight_tensor = ps.weight_tensor
scale_tensor = ps.scale_tensor
ofm_start = [0] * len(ofm_tensor.shape)
ofm_end = list(ofm_tensor.shape)
strides = None
skirt = None
upscaling = 1
if ps.primary_op is not None:
strides = ps.primary_op.attrs.get("strides", None)
skirt = ps.primary_op.attrs.get("skirt", None)
if ps.primary_op.type == "Conv2DBackpropInputSwitchedBias":
upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
elif ps.primary_op.type == "ResizeBilinear":
upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])
concat_axis = 0
concat_offset = 0
# Fusable activation functions
    activation_ops = {"Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"}
for op in ps.ops:
if op.type == "ConcatSliceWrite":
concat_axis = op.attrs["concat_axis"]
concat_start = op.attrs["concat_start"]
concat_end = op.attrs["concat_end"]
ofm_start[concat_axis] = concat_start
ofm_end[concat_axis] = concat_end
concat_offset = concat_start
ps.primary_op.attrs["fused_memory_function"] = op.type
elif op.type in activation_ops:
ps.primary_op.attrs["fused_activation_function"] = op.type
if strat == SchedulingStrategy.WeightStream:
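        # Weight streaming: step through the OFM depth in block-config-sized
        # slices so that each slice's weights can be DMA'd in; if the weights
        # do not need DMA, a single full-depth stripe is issued instead.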
ofm_step = block_config[-1]
ofm_stop = ofm_end[-1]
if weight_tensor is None or not weight_tensor.needs_dma():
ofm_step = ofm_stop
for start in range(ofm_start[-1], ofm_stop, ofm_step):
end = min(start + ofm_step, ofm_stop)
ofm_start[-1] = start
ofm_end[-1] = end
ofm_box = Box(ofm_start, ofm_end)
ifm_box = None
ifm2_box = None
if ifm_tensor.shape != []:
ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
else:
ifm_box = Box([], [])
if ifm2_tensor is not None and ifm2_tensor.shape != []:
ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm2_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[1],
upscaling,
)
else:
ifm2_box = Box([], [])
for intermediate in ps.intermediates:
if (
intermediate is not None
and intermediate.shape != []
and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
):
if intermediate.purpose is TensorPurpose.FeatureMap:
intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
intermediate.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
else:
intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
yield from dma_if_necessary(ps, intermediate_box, intermediate)
weight_box = None
if weight_tensor is not None:
weight_oc_start = start
weight_oc_end = end
if concat_axis - len(weight_tensor.shape) == -1:
weight_oc_start -= concat_offset
weight_oc_end -= concat_offset
weight_box = Box.make_weight_box(
weight_tensor.shape,
npu_block_type,
weight_oc_start,
weight_oc_end,
weight_tensor.weight_transpose_depthwise,
)
yield from dma_if_necessary(ps, weight_box, weight_tensor)
yield NpuStripe(
ps,
block_config,
is_first,
is_last,
True,
True,
ifm_tensor,
ifm_box,
ofm_tensor,
ofm_box,
weight_tensor,
weight_box,
scale_tensor,
concat_axis,
concat_offset,
ifm2_tensor=ifm2_tensor,
ifm2_box=ifm2_box,
)
elif strat == SchedulingStrategy.IfmStream:
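        # IFM streaming: step through the OFM height one block row at a time,
        # pulling stripes from the preceding pass on demand to supply the IFM.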
y_step = block_config[0]
y_start = 0
y_dim = 1
if len(ofm_tensor.shape) >= 3:
y_start = ofm_start[-3]
y_dim = ofm_end[-3]
if idx > 0:
ifm_y_present = 0
prev_pass = passes[idx - 1]
prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
else:
ifm_y_present = 1
if len(ifm_tensor.shape) >= 3:
ifm_y_present = ifm_tensor.shape[-3]
prev_pass_gen = []
prev_pass = None
if len(passes) == 1:
# no cascading, can just issue one big stripe
# but only if we've done allocation and OFM does not overlap IFM
if ifm_tensor.address is not None and ofm_tensor.address is not None:
if (
ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
):
y_step = y_dim
weight_box = None
for start in range(y_start, y_dim, y_step):
end = min(start + y_step, y_dim)
if len(ofm_tensor.shape) >= 3:
ofm_start[-3] = start
ofm_end[-3] = end
ofm_box = Box(ofm_start, ofm_end)
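            # The kernel height is needed to derive the IFM skirt: pooling and
            # reduce ops carry it in their "ksize" attribute, convolutions in
            # the weight tensor shape.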
k_height = 1
if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
if ps.primary_op is not None:
k_height = ps.primary_op.attrs["ksize"][1]
else:
if weight_tensor is not None:
k_height = weight_tensor.shape[0]
ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
ifm_tensor.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
k_height,
upscaling,
)
for intermediate in ps.intermediates:
if (
intermediate is not None
and intermediate.shape != []
and intermediate.purpose == TensorPurpose.FeatureMap
):
intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
strides,
skirt,
intermediate.shape,
npu_block_type,
concat_axis,
concat_offset,
split_offsets[0],
upscaling,
)
yield from dma_if_necessary(ps, intermediate_box, intermediate)
ifm_y_needed = 1
if len(ifm_box.end_coord) >= 3:
ifm_y_needed = ifm_box.end_coord[-3]
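            # If the preceding pass has not yet produced all the IFM rows this
            # stripe reads, pull stripes from its generator until it has.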
if ifm_y_present < ifm_y_needed:
for prev_cmd in prev_pass_gen:
yield prev_cmd
rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
if rng is not None:
ifm_y_present = max(ifm_y_present, rng[1])
if ifm_y_present >= ifm_y_needed:
break
if weight_tensor is not None and weight_box is None:
weight_box = Box.make_weight_box(
weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
)
yield from dma_if_necessary(ps, weight_box, weight_tensor)
# Check if first/last stripe in pass
is_first_h_stripe = start == y_start
is_last_h_stripe = (start + y_step) >= y_dim
stripe = NpuStripe(
ps,
block_config,
is_first,
is_last,
is_first_h_stripe,
is_last_h_stripe,
ifm_tensor,
ifm_box,
ofm_tensor,
ofm_box,
weight_tensor,
weight_box,
scale_tensor,
concat_axis,
concat_offset,
None,
None,
pad_top,
pad_bottom,
)
yield stripe
else:
assert 0, "unknown scheduling strategy"


def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
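    """Yield the high-level commands for a list of cascaded passes.

    Weight streaming generates each pass in turn; IFM streaming starts from
    the last pass, which recursively pulls in stripes from its predecessors.
    """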
if strat == SchedulingStrategy.WeightStream:
for idx in range(len(passes)):
yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
elif strat == SchedulingStrategy.IfmStream:
yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
else:
assert 0, "Unknown streaming strategy"


def generate_high_level_command_stream_for_cascaded_pass(cps):
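    """Yield the high-level command stream for a single CascadedPass."""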
yield from generate_high_level_command_stream_for_pass_list(
cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
)


def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
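    """Generate and attach the high-level command stream for subgraph sg.

    Only cascaded passes placed on the NPU contribute commands.
    """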
res = []
for cps in sg.cascaded_passes:
if cps.placement == PassPlacement.Npu:
res += list(generate_high_level_command_stream_for_cascaded_pass(cps))
sg.high_level_command_stream = res
if verbose_high_level_command_stream:
sg.print_high_level_command_stream()


def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
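    """Return how many bytes of the OFM may safely overlap the IFM in memory.

    Walks the generated command stream and, at each first/last stripe,
    compares the highest OFM write offset against the lowest IFM read offset
    still needed; the smallest margin seen is the allowed overlap. Returns 0
    whenever the overlap cannot be determined.
    """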
highest_ofm_write = 0
if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
return 0
    ifm_read = passes[0].ifm_tensor.storage_size()
min_overlap = 999999999999999999999
ofm_size = passes[-1].ofm_tensor.storage_size()
if strat == SchedulingStrategy.WeightStream:
return 0
for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
if cmd.is_npu_pass_command():
if cmd.is_first:
ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
if ifm_read is None:
return 0
if cmd.is_last:
write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
if write_offset is None:
return 0
highest_ofm_write = max(write_offset, highest_ofm_write)
if cmd.is_first or cmd.is_last:
overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
can_overwrite = ofm_size - overlap_required
min_overlap = min(min_overlap, can_overwrite)
if cmd.is_first:
ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)
min_overlap = max(min_overlap, 0)
return min_overlap


def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
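    """Return the allowed OFM/IFM overlap in bytes for a CascadedPass."""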
return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])