blob: 1dac1815d35eb5ae8d1b0fc5a5bf185c063a8246 [file] [log] [blame]
# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Generate a high-level command stream from a schedule
Johan Alfven0b4ac762023-06-12 10:56:42 +020019from .architecture_allocator import is_nearest
Diego Russoe8a10452020-04-21 17:39:10 +010020from .high_level_command_stream import Box
21from .high_level_command_stream import DMA
Johan Alfven90724962023-02-02 09:07:48 +010022from .high_level_command_stream import NOP
Diego Russoe8a10452020-04-21 17:39:10 +010023from .high_level_command_stream import NpuStripe
Charles Xu89a6bbf2020-08-11 12:31:58 +020024from .numeric_util import round_up_divide
Louis Verhaarde8a5a782020-11-02 18:04:27 +010025from .operation import create_activation_function
Tim Hall79d07d22020-04-27 18:20:16 +010026from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020027from .operation import Op
patrik.gustavssoneeb85152020-12-21 17:10:40 +000028from .shape4d import Shape4D
Charles Xu78792222020-05-13 10:15:26 +020029from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010030
31
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command when `tensor` must be copied in from its source tensor.

    A single DMA(ps, source, tensor, box) is produced only when `tensor.src_tensor`
    is set and lives in a different memory area than `tensor`; otherwise nothing
    is yielded.
    """
    source = tensor.src_tensor
    if not source:
        # No source tensor registered - nothing to transfer
        return
    if tensor.mem_area != source.mem_area:
        yield DMA(ps, source, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010036
Tim Hallc30f4952020-06-15 20:47:35 +010037
def dma_feature_map_if_necessary(ps, src_tensor, dst_tensor):
    """Yield the command that moves a whole feature map from `src_tensor` to `dst_tensor`.

    The box spans the full shape of `src_tensor`. A DMA command is yielded when the
    two tensors differ in start address or memory area; otherwise a NOP is yielded.
    """
    full_box = Box([0] * len(src_tensor.shape), list(src_tensor.shape))
    src_addr = src_tensor.address_for_coordinate(full_box.start_coord)
    dst_addr = dst_tensor.address_for_coordinate(full_box.start_coord)

    needs_copy = src_addr != dst_addr or src_tensor.mem_area != dst_tensor.mem_area
    if needs_copy:
        command = DMA(ps, src_tensor, dst_tensor, full_box)
    else:
        # Source and destination coincide, so no transfer is required. A NOP is
        # still emitted so the step stays visible when the high-level command
        # stream is printed.
        command = NOP(ps, src_tensor, dst_tensor)
    yield command
49
50
def generate_high_level_command_stream_for_schedule(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph `sg` and store it on the subgraph.

    Ops outside a cascade (cascade id 0) are expanded one by one. For a cascade,
    generation is started once from the cascade's last Op, and every other member
    of that cascade is skipped. The result is assigned to
    `sg.high_level_command_stream`; when `verbose_high_level_command_stream` is
    truthy the stream is also printed via `sg.print_high_level_command_stream()`.

    `nng` and `arch` are accepted but not used by this function.
    """
    commands = []
    handled_cascades = set()
    # sg.sched_ops are ordered by execution
    for sched_op in sg.sched_ops:
        cascade_id = sg.schedule.cost_map[sched_op].cascade
        if cascade_id in handled_cascades:
            # Commands for this cascade were already generated
            continue

        in_cascade = cascade_id != 0
        if in_cascade:
            # Expand the whole cascade, starting from its last Op
            entry_op = sg.sched_ops[sg.schedule.cascades[cascade_id].end]
        else:
            # Expand this Op in isolation
            entry_op = sched_op

        commands.extend(generate_high_level_commands_for_sched_op(entry_op, sg.schedule))
        if in_cascade:
            handled_cascades.add(cascade_id)

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
74
75
def generate_high_level_commands_for_sched_op(sched_op, schedule):
    """Yield the high-level commands that execute `sched_op` under `schedule`.

    The OFM region is walked in stripes (height, then width, then depth slice).
    For each stripe the generator yields, in order:

    * when `sched_op` is a non-first member of a cascade: commands of the Op
      producing its IFM, pulled lazily from a recursive generator until the
      produced region covers the IFM box this stripe requires;
    * a weight DMA (via dma_if_necessary) for buffered weights on the first
      height stripe;
    * a LUT DMA (via dma_if_necessary), at most once per height/width stripe,
      when the parent op has an activation LUT;
    * either the feature-map DMA/NOP for Memcpy ops or an NpuStripe command.

    Side effect: `ps.primary_op.activation` is (re)assigned for every ReLU-type,
    Tanh or Sigmoid op found in the pass.

    Args:
        sched_op: the scheduled Op to expand; read for its parent pass/op,
            IFM/IFM2/OFM descriptions, kernel, op type and cascade index.
        schedule: provides `cost_map` (per-Op scheduling info) and `cascades`.
    """
    op_info = schedule.cost_map[sched_op]
    cascade_info = schedule.cascades.get(op_info.cascade)
    ps = sched_op.parent_ps
    parent_op = sched_op.parent_op
    npu_block_type = ps.npu_block_type
    block_config = op_info.block_config
    ofm_tensor = ps.ofm_tensor

    # Tensors of the parent op; biases and OFM from this call are not needed here
    ifm_tensor, ifm2_tensor, uncomp_weight_tensor, _, _ = parent_op.get_ifm_ifm2_weights_biases_ofm()
    if sched_op.reversed_operands:
        ifm_tensor, ifm2_tensor = ifm2_tensor, ifm_tensor
    ifm = sched_op.ifm
    ifm2 = sched_op.ifm2
    ofm_shape = sched_op.ofm.shape

    # Kernel strides and skirt
    stride = sched_op.kernel.stride
    strides = [1, stride.y, stride.x, 1]
    skirt = parent_op.attrs.get("skirt")

    # Upscaling factor
    if sched_op.op_type == Op.Conv2DBackpropInputSwitchedBias:
        upscaling = ofm_shape.height // ifm.shape.height
    elif is_nearest(sched_op.resampling_mode):
        upscaling = round_up_divide(ofm_shape.height, ifm.shape.height)
    else:
        upscaling = 1

    # Kernel height: taken from ksize for pooling-like blocks, otherwise from the weights
    if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
        k_height = parent_op.attrs["ksize"][1]
    elif uncomp_weight_tensor is not None:
        k_height = uncomp_weight_tensor.shape[0]
    else:
        k_height = 1

    # Height dilation is the third element from the end; it defaults to 1
    k_height_dilation = parent_op.attrs.get("dilation", (1, 1, 1, 1))[-3]
    k_dilated_height = k_height_dilation * (k_height - 1) + 1

    # Start and end coordinates of the OFM region to produce
    ofm_depth_slices = op_info.ofm_depth_slices
    ofm_start = Shape4D(0, 0, 0, ofm_depth_slices[0])
    ofm_end = ofm_shape

    # Read/write offsets
    read_offsets = list(parent_op.read_offsets)  # offset for [ifm, ifm2]
    read_shapes = list(parent_op.read_shapes)  # read shapes for [ifm, ifm2]
    write_offset = Shape4D(0, 0, 0, 0)
    if parent_op.write_offset is not None:
        # An explicit write offset overrides the default OFM region
        write_offset = parent_op.write_offset
        ofm_start = write_offset
        ofm_end = parent_op.write_offset + parent_op.write_shape

    # Create activation function if needed
    for op in ps.ops:
        op_type = op.type
        if op_type.is_relu_op() or op_type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(
                op_type, min=op.attrs.get("min"), max=op.attrs.get("max")
            )

    # Set up generation of the commands for the Op that produces this Op's IFM, if applicable
    if cascade_info is None or cascade_info.start == sched_op.index:
        # Lone Op or first Op in a cascade - all IFM data is present
        ifm_present = Box([0, 0, 0, 0], ifm.shape.as_list())
        producer_op = None
        prev_cmd_gen = []
    else:
        # Nothing has been produced yet; the producer is advanced on demand below
        ifm_present = Box([0, 0, 0, 0], [0, 0, 0, 0])
        producer_op = sched_op.ifm.connection.producers[0]
        prev_cmd_gen = generate_high_level_commands_for_sched_op(producer_op, schedule)

    def input_box_for(out_box, input_shape, operand_idx):
        # Map an OFM box to (input box, pad_top, pad_bottom) for operand 0 (IFM) or 1 (IFM2)
        return out_box.transform_with_strides_and_skirt(
            strides,
            skirt,
            input_shape,
            npu_block_type,
            write_offset.as_list(),
            k_dilated_height,
            read_offsets[operand_idx],
            read_shapes[operand_idx],
            upscaling,
            sched_op.op_type,
        )

    stripe = op_info.stripe
    for h_begin in range(ofm_start.height, ofm_end.height, stripe.height):
        h_end = min(h_begin + stripe.height, ofm_end.height)
        for w_begin in range(ofm_start.width, ofm_end.width, stripe.width):
            w_end = min(w_begin + stripe.width, ofm_end.width)

            lut_dma_done = False
            for depth_idx, (slice_begin, slice_end) in enumerate(zip(ofm_depth_slices, ofm_depth_slices[1:])):
                c_begin = max(slice_begin, ofm_start.depth)
                c_end = min(slice_end, ofm_end.depth)

                # Construct the OFM box for the current stripe
                ofm_box_start = Shape4D(ofm_start.batch, h_begin, w_begin, c_begin)
                ofm_box_end = Shape4D(ofm_end.batch, h_end, w_end, c_end)
                ofm_box = Box(ofm_box_start.as_list(), ofm_box_end.as_list())

                # Calculate the input boxes from the OFM box. When both inputs
                # exist, the padding reported for IFM2 is the one that is kept.
                ifm_box = Box([], [])
                ifm2_box = Box([], [])
                if ifm:
                    ifm_box, pad_top, pad_bottom = input_box_for(ofm_box, ifm.shape, 0)
                if ifm2:
                    ifm2_box, pad_top, pad_bottom = input_box_for(ofm_box, ifm2.shape, 1)

                # Advance the Op that produces this Op's IFM data - only applicable within cascades
                ifm_required = ifm_box
                if producer_op:
                    assert op_info.cascade != 0
                    assert op_info.cascade == schedule.cost_map[producer_op].cascade
                    if not ifm_required.is_subbox_of(ifm_present):
                        # The same generator is resumed on later stripes, continuing where it stopped
                        for prev_cmd in prev_cmd_gen:
                            yield prev_cmd
                            if prev_cmd.is_npu_pass_command() and prev_cmd.ps == producer_op.parent_ps:
                                ifm_present.end_coord = prev_cmd.ofm_box.end_coord
                                if ifm_required.is_subbox_of(ifm_present):
                                    # There is enough IFM data - stop pulling producer commands
                                    break

                # Information about the current stripe's location in the cascade
                is_first_h_stripe = ofm_box_start.height == ofm_start.height
                is_last_h_stripe = ofm_box_end.height >= ofm_end.height

                # Calculate the weight box - i.e. the subshape of weights needed for this NpuStripe command
                weight_tensor = op_info.npu_weights_tensor
                scale_tensor = op_info.npu_scales_tensor
                if weight_tensor:
                    weight_box = Box([0, 0, 0, c_begin], [1, 1, 1, c_end])

                    buffered = op_info.buffered_weight_tensors
                    if buffered:
                        # The buffer to use is chosen by depth index, wrapping around
                        weight_tensor = buffered[depth_idx % len(buffered)]
                        if is_first_h_stripe:
                            yield from dma_if_necessary(ps, weight_box, weight_tensor)
                else:
                    weight_box = None

                # Should only be done once per loop but not before weights above
                if parent_op.activation_lut and not lut_dma_done:
                    lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
                    lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
                    lut_dma_done = True
                    yield from dma_if_necessary(ps, lut_box, lut_tensor)

                if parent_op.type == Op.Memcpy:
                    yield from dma_feature_map_if_necessary(ps, ifm_tensor, ofm_tensor)
                else:
                    yield NpuStripe(
                        ps,
                        block_config.old_style_representation(),
                        is_first_h_stripe,
                        is_last_h_stripe,
                        ifm_tensor,
                        ifm_box,
                        ofm_tensor,
                        ofm_box,
                        weight_tensor,
                        weight_box,
                        scale_tensor,
                        ifm2_tensor=ifm2_tensor,
                        ifm2_box=ifm2_box,
                        pad_top=pad_top,
                        pad_bottom=pad_bottom,
                        reversed_operands=sched_op.reversed_operands,
                    )