Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Rickard Bolin | bc6ee58 | 2022-11-04 08:24:29 +0000 | [diff] [blame] | 16 | # |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 17 | # Description: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 18 | # Generate a high-level command stream from a schedule |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 19 | from .high_level_command_stream import Box |
| 20 | from .high_level_command_stream import DMA |
Johan Alfven | 9072496 | 2023-02-02 09:07:48 +0100 | [diff] [blame^] | 21 | from .high_level_command_stream import NOP |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 22 | from .high_level_command_stream import NpuStripe |
Charles Xu | 89a6bbf | 2020-08-11 12:31:58 +0200 | [diff] [blame] | 23 | from .numeric_util import round_up_divide |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 24 | from .operation import create_activation_function |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 25 | from .operation import NpuBlockType |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 26 | from .operation import Op |
patrik.gustavsson | eeb8515 | 2020-12-21 17:10:40 +0000 | [diff] [blame] | 27 | from .shape4d import Shape4D |
Charles Xu | 7879222 | 2020-05-13 10:15:26 +0200 | [diff] [blame] | 28 | from .tensor import TensorPurpose |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 29 | |
| 30 | |
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command for ``tensor`` when it must be copied from its source tensor.

    A single ``DMA(ps, src_tensor, tensor, box)`` command is produced only when
    ``tensor.src_tensor`` is set and lives in a different ``mem_area`` than
    ``tensor``; otherwise the generator yields nothing.
    """
    source = tensor.src_tensor
    if not source:
        # Nothing to copy from
        return

    in_other_mem_area = tensor.mem_area != source.mem_area
    if in_other_mem_area:
        yield DMA(ps, source, tensor, box)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 35 | |
Tim Hall | c30f495 | 2020-06-15 20:47:35 +0100 | [diff] [blame] | 36 | |
def dma_feature_map_if_necessary(ps, src_tensor, dst_tensor):
    """Yield the command that copies a whole feature map from ``src_tensor`` to ``dst_tensor``.

    The box covers the full shape of ``src_tensor``. A ``DMA`` command is
    yielded when the start addresses or the memory areas of the two tensors
    differ, otherwise a ``NOP`` command is yielded instead.
    """
    full_shape = src_tensor.shape
    box = Box([0] * len(full_shape), list(full_shape))
    origin = box.start_coord
    src_addr = src_tensor.address_for_coordinate(origin)
    dst_addr = dst_tensor.address_for_coordinate(origin)

    needs_copy = src_addr != dst_addr or src_tensor.mem_area != dst_tensor.mem_area
    if not needs_copy:
        # Source and destination are the same, so no DMA transaction is needed.
        # Emit a NOP for visibility when printing the high_level_command_stream
        yield NOP(ps, src_tensor, dst_tensor)
        return

    yield DMA(ps, src_tensor, dst_tensor, box)
| 48 | |
| 49 | |
def generate_high_level_command_stream_for_schedule(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream of ``sg`` from its schedule.

    ``sg.sched_ops`` is walked in execution order. An Op whose cascade id is 0
    is generated in isolation. For any other cascade id the whole cascade is
    generated once, starting from the last Op of the cascade, and the cascade is
    then skipped for its remaining Ops.

    The resulting list is stored in ``sg.high_level_command_stream`` and printed
    through ``sg.print_high_level_command_stream()`` when
    ``verbose_high_level_command_stream`` is truthy. ``nng`` and ``arch`` are not
    used here; they are kept so the signature stays unchanged for callers.
    """
    schedule = sg.schedule
    commands = []
    # sg.sched_ops are ordered by execution
    processed_cascades = set()
    for sched_op in sg.sched_ops:
        cascade = schedule.cost_map[sched_op].cascade
        if cascade in processed_cascades:
            # This cascade has already been processed
            continue

        if cascade == 0:
            # Generate high-level commands for this Op in isolation
            entry_op = sched_op
        else:
            # Generate high-level commands for the whole cascade,
            # starting from the last Op in the cascade
            entry_op = sg.sched_ops[schedule.cascades[cascade].end]

        # Consume the generator directly instead of materialising a temporary list
        commands.extend(generate_high_level_commands_for_sched_op(entry_op, schedule))

        if cascade != 0:
            processed_cascades.add(cascade)

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
| 73 | |
| 74 | |
def generate_high_level_commands_for_sched_op(sched_op, schedule):
    """Yield the high-level commands that produce the OFM of ``sched_op``.

    The OFM region is walked stripe by stripe: height first, then width, then
    one depth slice at a time. For each stripe the generator:

    * derives the IFM/IFM2 boxes required for the OFM box,
    * when ``sched_op`` is not the first Op of its cascade, yields commands of
      the Op producing its IFM until the required IFM box is present,
    * yields a DMA for the buffered weights on the first height stripe
      (if a copy is necessary),
    * yields a DMA for the activation LUT once per height/width stripe
      (if a copy is necessary),
    * yields either the feature map DMA/NOP for ``Op.Memcpy`` or an
      ``NpuStripe`` command for everything else.

    As a side effect ``ps.primary_op.activation`` is set when the pass contains
    a ReLU-type, Tanh or Sigmoid Op.
    """
    op_info = schedule.cost_map[sched_op]
    cascade_info = schedule.cascades.get(op_info.cascade)
    ps = sched_op.parent_ps
    npu_block_type = ps.npu_block_type
    block_config = op_info.block_config
    parent_op = sched_op.parent_op
    ofm_tensor = ps.ofm_tensor

    # Get Tensors and Full Shapes
    (
        ifm_tensor,
        ifm2_tensor,
        uncomp_weight_tensor,
        _,
        _,
    ) = parent_op.get_ifm_ifm2_weights_biases_ofm()
    if sched_op.reversed_operands:
        ifm2_tensor, ifm_tensor = ifm_tensor, ifm2_tensor
    ifm = sched_op.ifm
    ifm2 = sched_op.ifm2
    ofm_shape = sched_op.ofm.shape

    # Get Kernel strides and upscaling factor
    kernel_stride = sched_op.kernel.stride
    strides = [1, kernel_stride.y, kernel_stride.x, 1]
    skirt = parent_op.attrs.get("skirt", None)
    upscaling = 1
    if sched_op.op_type == Op.Conv2DBackpropInputSwitchedBias:
        upscaling = ofm_shape.height // ifm.shape.height
    elif sched_op.op_type.is_resize_op():
        upscaling = round_up_divide(ofm_shape.height, ifm.shape.height)

    # Get kernel height and height dilation
    # (parent_op has already been dereferenced above, so it cannot be None here)
    k_height = 1
    if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
        k_height = parent_op.attrs["ksize"][1]
    elif uncomp_weight_tensor is not None:
        k_height = uncomp_weight_tensor.shape[0]

    # Only the height entry (index -3) of the dilation is used; it defaults to 1
    k_height_dilation = parent_op.attrs.get("dilation", (1, 1, 1, 1))[-3]

    # Calculate dilated kernel height
    k_dilated_height = k_height_dilation * (k_height - 1) + 1

    # Define Start and End coordinates for the OFM
    ofm_depth_slices = op_info.ofm_depth_slices
    ofm_start = Shape4D(0, 0, 0, ofm_depth_slices[0])
    ofm_end = ofm_shape

    # Read/Write offsets
    read_offsets = list(parent_op.read_offsets)  # offset for [ifm, ifm2]
    read_shapes = list(parent_op.read_shapes)  # read shapes for [ifm, ifm2]
    write_offset = Shape4D(0, 0, 0, 0)
    if parent_op.write_offset is not None:
        write_offset = parent_op.write_offset
        ofm_start = write_offset
        ofm_end = write_offset + parent_op.write_shape

    # Create activation function if needed. The last Op of the pass is bound
    # explicitly because its type is forwarded to the IFM box calculation below.
    last_pass_op = None
    for pass_op in ps.ops:
        last_pass_op = pass_op
        if pass_op.type.is_relu_op() or pass_op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(
                pass_op.type, min=pass_op.attrs.get("min", None), max=pass_op.attrs.get("max", None)
            )

    # Generate commands for the Op that produces this Op's IFM, if applicable
    if cascade_info is None or cascade_info.start == sched_op.index:
        # Lone Op or First Op in cascade - all IFM data is present
        ifm_present = Box([0, 0, 0, 0], ifm.shape.as_list())
        producer_op = None
        prev_cmd_gen = []
    else:
        ifm_present = Box([0, 0, 0, 0], [0, 0, 0, 0])
        producer_op = sched_op.ifm.connection.producers[0]
        # The producer's generator is created once and advanced lazily across stripes
        prev_cmd_gen = generate_high_level_commands_for_sched_op(producer_op, schedule)

    ofm_step = op_info.stripe
    for start_height in range(ofm_start.height, ofm_end.height, ofm_step.height):
        end_height = min(start_height + ofm_step.height, ofm_end.height)
        for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width):
            end_width = min(start_width + ofm_step.width, ofm_end.width)

            lut_dma_done = False
            for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]):
                start_channel = max(start_channel, ofm_start.depth)
                end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth)

                # Construct the OFM box for the current stripe
                ofm_box_start = Shape4D(ofm_start.batch, start_height, start_width, start_channel)
                ofm_box_end = Shape4D(ofm_end.batch, end_height, end_width, end_channel)
                ofm_box = Box(ofm_box_start.as_list(), ofm_box_end.as_list())
                ifm_box = Box([], [])
                ifm2_box = Box([], [])
                # Calculate IFM input box based on the OFM box
                if ifm:
                    ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        ifm.shape,
                        npu_block_type,
                        write_offset.as_list(),
                        k_dilated_height,
                        read_offsets[0],
                        read_shapes[0],
                        upscaling,
                        last_pass_op.type,
                    )
                # Calculate IFM2 input box based on the OFM box
                # (when IFM2 is present its padding values replace those of the IFM)
                if ifm2:
                    ifm2_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        ifm2.shape,
                        npu_block_type,
                        write_offset.as_list(),
                        k_dilated_height,
                        read_offsets[1],
                        read_shapes[1],
                        upscaling,
                        last_pass_op.type,
                    )

                ifm_required = ifm_box
                # Get the Op that produces this Op's IFM data - only applicable within cascades
                if producer_op:
                    assert op_info.cascade != 0
                    assert op_info.cascade == schedule.cost_map[producer_op].cascade
                    if not ifm_required.is_subbox_of(ifm_present):
                        for prev_cmd in prev_cmd_gen:
                            yield prev_cmd
                            if prev_cmd.is_npu_pass_command() and prev_cmd.ps == producer_op.parent_ps:
                                ifm_present.end_coord = prev_cmd.ofm_box.end_coord
                                if ifm_required.is_subbox_of(ifm_present):
                                    # There is enough IFM data - exit loop
                                    break

                # Information about the current stripe's location in the cascade
                is_first_h_stripe = ofm_box_start.height == ofm_start.height
                is_last_h_stripe = ofm_box_end.height >= ofm_end.height

                # Calculate the weight box - i.e. the subshape of weights needed for this NpuStripe command
                weight_tensor = op_info.npu_weights_tensor
                scale_tensor = op_info.npu_scales_tensor
                if weight_tensor:
                    weight_box = Box([0, 0, 0, start_channel], [1, 1, 1, end_channel])

                    buffered_weights = op_info.buffered_weight_tensors
                    if buffered_weights:
                        # Buffers are selected round-robin by depth slice index
                        weight_tensor = buffered_weights[depth_idx % len(buffered_weights)]
                        if is_first_h_stripe:
                            yield from dma_if_necessary(ps, weight_box, weight_tensor)
                else:
                    weight_box = None

                # Should only be done once per loop but not before weights above
                if parent_op.activation_lut and not lut_dma_done:
                    lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
                    lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
                    lut_dma_done = True
                    yield from dma_if_necessary(ps, lut_box, lut_tensor)

                if parent_op.type == Op.Memcpy:
                    yield from dma_feature_map_if_necessary(ps, ifm_tensor, ofm_tensor)
                else:
                    yield NpuStripe(
                        ps,
                        block_config.old_style_representation(),
                        is_first_h_stripe,
                        is_last_h_stripe,
                        ifm_tensor,
                        ifm_box,
                        ofm_tensor,
                        ofm_box,
                        weight_tensor,
                        weight_box,
                        scale_tensor,
                        ifm2_tensor=ifm2_tensor,
                        ifm2_box=ifm2_box,
                        pad_top=pad_top,
                        pad_bottom=pad_bottom,
                        reversed_operands=sched_op.reversed_operands,
                    )