| # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| # |
| # SPDX-License-Identifier: Apache-2.0 |
| # |
| # Licensed under the Apache License, Version 2.0 (the License); you may |
| # not use this file except in compliance with the License. |
| # You may obtain a copy of the License at |
| # |
| # www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, software |
| # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| # See the License for the specific language governing permissions and |
| # limitations under the License. |
| # Description: |
| # Generate a high-level command stream from a scheduled subgraph with CascadedPasses. |
| # |
| # Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using |
| # calc_allowed_ofm_ifm_overlap_for_cascaded_pass(). |
| from .high_level_command_stream import Box |
| from .high_level_command_stream import DMA |
| from .high_level_command_stream import NpuStripe |
| from .nn_graph import PassPlacement |
| from .nn_graph import SchedulingStrategy |
| from .numeric_util import round_up_divide |
| from .operation import create_activation_function |
| from .operation import NpuBlockType |
| from .operation import Op |
| from .tensor import TensorPurpose |
| |
| |
def dma_if_necessary(ps, box, tensor):
    """Yield a single DMA command for ``tensor`` if it reports that it needs one.

    The DMA source is the first input of the first op that produces ``tensor``;
    the destination is ``tensor`` itself, restricted to ``box``.  Nothing is
    yielded when ``tensor.needs_dma()`` is false.
    """
    if not tensor.needs_dma():
        return
    source_tensor = tensor.ops[0].inputs[0]
    yield DMA(ps, source_tensor, tensor, box)
| |
| |
def match_tensor(source, derived):
    """Tell whether ``derived`` is ``source`` or a SplitSliceRead view of it.

    Returns True when the two compare equal.  Otherwise the result is truthy
    only when ``derived`` has exactly one producing op, that op is of type
    ``Op.SplitSliceRead`` and its first input compares equal to ``source``.
    """
    if source == derived:
        return True
    producers = derived.ops
    if len(producers) != 1:
        return False
    return producers[0].type == Op.SplitSliceRead and source == producers[0].inputs[0]
| |
| |
def _generate_intermediate_dmas(
    ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offset, upscaling
):
    """Yield the DMA commands required by the intermediate tensors of ``ps`` for one OFM box.

    Only intermediates that are not None, have a non-empty shape and whose purpose is
    FeatureMap or LUT are considered.  A FeatureMap intermediate gets a box derived from
    ``ofm_box``; a LUT intermediate gets a box covering its whole shape.
    """
    for intermediate in ps.intermediates:
        if (
            intermediate is not None
            and intermediate.shape != []
            and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
        ):
            if intermediate.purpose is TensorPurpose.FeatureMap:
                # NOTE(review): `upscaling` is passed in the 8th positional slot here, whereas the
                # IFM-stream ifm_box computation passes `k_height` in that slot and `upscaling` in
                # the 9th.  Confirm against Box.transform_with_strides_and_skirt's signature.
                intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    intermediate.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offset,
                    upscaling,
                )
            else:
                intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
            yield from dma_if_necessary(ps, intermediate_box, intermediate)


def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generate the high-level commands for pass ``idx`` of ``passes``.

    This is a generator.  It yields the commands produced by ``dma_if_necessary`` for
    intermediate, weight and scale tensors, interleaved with ``NpuStripe`` commands.

    * ``SchedulingStrategy.WeightStream``: the OFM is walked along its last axis in steps
      of ``block_config[-1]``; a single stripe is issued when there is no weight tensor or
      the weights do not need DMA.
    * ``SchedulingStrategy.IfmStream``: the OFM is walked along axis -3 in steps of
      ``block_config[0]``.  For ``idx > 0`` the generator of the previous pass is consumed
      on demand until it has reported enough output rows for the current stripe.

    Args:
        strat: the scheduling strategy (WeightStream or IfmStream).
        passes: list of passes; ``passes[idx]`` is the pass being generated.
        block_configs: block configurations, indexed like ``passes``.
        idx: index of the pass to generate.

    Side effects:
        May swap ``ps.ifm_tensor`` / ``ps.ifm2_tensor`` for elementwise passes, and sets
        ``ps.primary_op.memory_function`` and ``ps.primary_op.activation`` from the
        pass's ops.

    Raises:
        AssertionError: if a SplitSliceRead input of an elementwise pass matches neither
            IFM, or if ``strat`` is not a known strategy.  These are raised explicitly so
            that they are not stripped when Python runs with ``-O``.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor

        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                ps.primary_op.memory_function = op.type
                assert len(op.inputs) == 1
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    # Explicit raise (not `assert False`) so the check survives `python -O`.
                    raise AssertionError("SplitSliceRead input matches neither ifm nor ifm2")
    else:
        # Split offsets are assigned to ifm, then ifm2, in the order the ops appear.
        ifm_idx = 0
        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.memory_function = op.type
                ifm_idx += 1

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # The OFM region starts out as the whole tensor; concat and striping narrow it below.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
        elif ps.primary_op.type == Op.ResizeBilinear:
            upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])

    concat_axis = 0
    concat_offset = 0

    for op in ps.ops:
        if op.type == Op.ConcatSliceWrite:
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            # Restrict the OFM region to the slice this pass writes.
            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.memory_function = op.type
        elif op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(op.type)

    if strat == SchedulingStrategy.WeightStream:
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # Nothing to stream in: cover the whole last axis in one stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)

            # NOTE(review): in the two calls below `upscaling` is the 8th positional argument,
            # while the IFM-stream ifm_box call passes `k_height` 8th and `upscaling` 9th.
            # Confirm against Box.transform_with_strides_and_skirt's signature.
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

            yield from _generate_intermediate_dmas(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling
            )

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                if concat_axis - len(weight_tensor.shape) == -1:
                    # The concat offset is subtracted so the weight range is relative to
                    # this pass's own slice rather than to the concatenated OFM.
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Rows of this pass's IFM are produced lazily by the previous pass's generator.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM counts as already present.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Weight and scale boxes are created (and their DMA issued) once, on the first stripe.
        weight_box = None
        scale_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            yield from _generate_intermediate_dmas(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling
            )

            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                # Pull (and forward) previous-pass commands until enough rows have been produced.
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
                yield from dma_if_necessary(ps, scale_box, scale_tensor)

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            # The two positional None values are presumably ifm2_tensor / ifm2_box (the
            # WeightStream branch passes those by keyword right after concat_offset).
            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        # Explicit raise (not `assert 0`) so an unknown strategy is never silently ignored under -O.
        raise AssertionError("unknown scheduling strategy")
| |
| |
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield the high-level commands for a whole list of passes.

    Args:
        strat: the scheduling strategy shared by all ``passes``.
        passes: the passes to generate commands for.
        block_configs: block configurations, indexed like ``passes``.

    Raises:
        AssertionError: if ``strat`` is neither WeightStream nor IfmStream.  Raised
            explicitly so the check is not removed when Python runs with ``-O``
            (an ``assert`` would then silently yield an empty stream).
    """
    if strat == SchedulingStrategy.WeightStream:
        # Each pass is generated independently, in order.
        for idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
    elif strat == SchedulingStrategy.IfmStream:
        # Only the last pass is requested here; for idx > 0 the per-pass generator
        # creates and consumes the generator of the preceding pass itself.
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
    else:
        raise AssertionError("Unknown streaming strategy")
| |
| |
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield the high-level commands for all passes of the cascaded pass ``cps``.

    The strategy and pass list come from ``cps``; each pass contributes its own
    ``block_config``.
    """
    member_passes = cps.passes
    member_block_configs = [member.block_config for member in member_passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, member_passes, member_block_configs)
| |
| |
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream of subgraph ``sg`` and store it on the subgraph.

    Commands are collected, in order, from every cascaded pass of ``sg`` that is placed
    on the NPU and assigned to ``sg.high_level_command_stream``.  When
    ``verbose_high_level_command_stream`` is truthy the stream is printed afterwards.
    ``nng`` and ``arch`` are not used by this function.
    """
    sg.high_level_command_stream = [
        cmd
        for cps in sg.cascaded_passes
        if cps.placement == PassPlacement.Npu
        for cmd in generate_high_level_command_stream_for_cascaded_pass(cps)
    ]
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
| |
| |
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Work out how much of the final OFM may overlap the initial IFM for a list of passes.

    The command stream for ``passes`` is generated and walked; for every NPU pass command
    belonging to the first and/or last pass the lowest IFM offset still to be read is
    compared with the highest OFM offset written so far.

    Args:
        strat: the scheduling strategy of the pass list.
        passes: the passes; the IFM of ``passes[0]`` and the OFM of ``passes[-1]`` are used.
        block_configs: block configurations, indexed like ``passes``.

    Returns:
        The allowed overlap, in the units of ``storage_size()`` /
        ``address_offset_for_coordinate()``, never negative.  0 is returned when either
        tensor is missing, for WeightStream, or when an address offset cannot be computed.
    """
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0
    if strat == SchedulingStrategy.WeightStream:
        return 0

    # storage_size is a method: it must be called, otherwise min(ifm_read, ofm_size) below
    # would compare a bound method with a number and raise TypeError.
    ifm_read = passes[0].ifm_tensor.storage_size()
    ofm_size = passes[-1].ofm_tensor.storage_size()
    highest_ofm_write = 0
    min_overlap = 999999999999999999999  # large sentinel, lowered by every first/last-pass command

    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if not cmd.is_npu_pass_command():
            continue

        if cmd.is_first:
            # Lowest IFM offset this command reads.
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
            if ifm_read is None:
                return 0
        if cmd.is_last:
            # Highest OFM offset written up to and including this command.
            write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
            if write_offset is None:
                return 0
            highest_ofm_write = max(write_offset, highest_ofm_write)

        if cmd.is_first or cmd.is_last:
            overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
            can_overwrite = ofm_size - overlap_required
            min_overlap = min(min_overlap, can_overwrite)

        if cmd.is_first:
            # Once the command has run, everything up to the end of its IFM box has been read.
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    return max(min_overlap, 0)