blob: 3d0a1e58d36bab4a2da109d360ffa4124edfa3f5 [file] [log] [blame]
Patrik Gustavssone3b1b912021-02-09 15:38:46 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Halld8339a72021-05-27 18:49:40 +010017# Generate a high-level command stream from a schedule
Diego Russoe8a10452020-04-21 17:39:10 +010018from .high_level_command_stream import Box
19from .high_level_command_stream import DMA
20from .high_level_command_stream import NpuStripe
Charles Xu89a6bbf2020-08-11 12:31:58 +020021from .numeric_util import round_up_divide
Louis Verhaarde8a5a782020-11-02 18:04:27 +010022from .operation import create_activation_function
Tim Hall79d07d22020-04-27 18:20:16 +010023from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020024from .operation import Op
patrik.gustavssoneeb85152020-12-21 17:10:40 +000025from .shape4d import Shape4D
Charles Xu78792222020-05-13 10:15:26 +020026from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010027
28
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command moving `box` of `tensor` from its source tensor.

    A DMA is only needed when the tensor has a source tensor and the two
    live in different memory areas; otherwise nothing is yielded.
    """
    source = tensor.src_tensor
    if not source:
        return
    if tensor.mem_area == source.mem_area:
        # Already resident in the target memory - no transfer required
        return
    yield DMA(ps, source, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010033
Tim Hallc30f4952020-06-15 20:47:35 +010034
def generate_high_level_command_stream_for_schedule(nng, sg, arch, verbose_high_level_command_stream):
    """Build sg.high_level_command_stream from the subgraph's schedule.

    Walks sg.sched_ops (already in execution order). Uncascaded Ops
    (cascade id 0) get their commands generated in isolation; for a cascade
    the generation starts from its last Op, which recursively pulls in the
    commands of its producers, so each cascade is processed only once.
    """
    commands = []
    seen_cascades = set()
    for sched_op in sg.sched_ops:
        cascade_id = sg.schedule.cost_map[sched_op].cascade

        if cascade_id in seen_cascades:
            # Commands for this Op were already emitted with its cascade
            continue

        if cascade_id == 0:
            # Not part of a cascade - generate this Op's commands on their own
            commands.extend(generate_high_level_commands_for_sched_op(sched_op, sg.schedule))
        else:
            # Whole cascade at once, starting from its final Op
            final_op = sg.sched_ops[sg.schedule.cascades[cascade_id].end]
            commands.extend(generate_high_level_commands_for_sched_op(final_op, sg.schedule))
            seen_cascades.add(cascade_id)

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
58
59
def generate_high_level_commands_for_sched_op(sched_op, schedule):
    """Yield the high-level commands (DMA and NpuStripe) for one scheduled Op.

    The OFM is traversed stripe by stripe: height, then width, then the
    depth slices chosen by the scheduler. One NpuStripe command is yielded
    per stripe, preceded when needed by DMA commands for buffered weights
    and the activation LUT. If this Op is inside a cascade and is not the
    cascade's first Op, the producer of its IFM is processed recursively and
    its commands are interleaved so a stripe is only emitted once enough IFM
    data has been produced.
    """
    op_info = schedule.cost_map[sched_op]
    # cascade id 0 means "not cascaded"; .get() then returns None
    cascade_info = schedule.cascades.get(op_info.cascade)
    npu_block_type = sched_op.parent_ps.npu_block_type
    block_config = op_info.block_config
    ps = sched_op.parent_ps
    parent_op = sched_op.parent_op
    ofm_tensor = ps.ofm_tensor

    # Get Tensors and Full Shapes
    (ifm_tensor, ifm2_tensor, uncomp_weight_tensor, _, _,) = parent_op.get_ifm_ifm2_weights_biases_ofm()
    ifm = sched_op.ifm
    ifm2 = sched_op.ifm2
    ofm_shape = sched_op.ofm.shape

    # Get Kernel strides and upscaling factor; strides are in NHWC order
    kernel_stride = sched_op.kernel.stride
    strides = [1, kernel_stride.y, kernel_stride.x, 1]
    skirt = parent_op.attrs.get("skirt", None)
    upscaling = 1
    if sched_op.op_type == Op.Conv2DBackpropInputSwitchedBias:
        # Transpose convolution: OFM height is an integer multiple of IFM height
        upscaling = ofm_shape.height // ifm.shape.height
    elif sched_op.op_type == Op.ResizeBilinear:
        upscaling = round_up_divide(ofm_shape.height, ifm.shape.height)

    # Get Kernel height - used when transforming OFM boxes back to IFM boxes
    k_height = 1
    if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
        if parent_op is not None:
            # ksize is in NHWC order, index 1 is the kernel height
            k_height = parent_op.attrs["ksize"][1]
    else:
        if uncomp_weight_tensor is not None:
            # assumes HWIO weight layout, dim 0 = kernel height - TODO confirm
            k_height = uncomp_weight_tensor.shape[0]

    # Define Start and End coordinates for the OFM
    ofm_start = Shape4D(0, 0, 0, op_info.ofm_depth_slices[0])
    ofm_end = ofm_shape

    ofm_depth_slices = op_info.ofm_depth_slices

    # Read/Write offsets
    read_offsets = list(parent_op.read_offsets)  # offset for [ifm, ifm2]
    read_shapes = list(parent_op.read_shapes)  # read shapes for [ifm, ifm2]
    write_offset = Shape4D(0, 0, 0, 0)
    if parent_op.write_offset is not None:
        # Op writes a sub-region of the OFM; narrow the traversal to it
        write_offset = parent_op.write_offset
        ofm_start = write_offset
        ofm_end = parent_op.write_offset + parent_op.write_shape

    # Create activation function if needed (fused ReLU-family/Tanh/Sigmoid
    # found anywhere in the pass is attached to the primary op)
    for op in ps.ops:
        if op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(
                op.type, min=op.attrs.get("min", None), max=op.attrs.get("max", None)
            )

    # Generate commands for the Op that produces this Op's IFM, if applicable
    if cascade_info is None or cascade_info.start == sched_op.index:
        # Lone Op or First Op in cascade - all IFM data is present
        ifm_present = Box([0, 0, 0, 0], ifm.shape.as_list())
        producer_op = None
        prev_cmd_gen = []
    else:
        # Mid/end of a cascade: IFM starts empty and grows as the producer runs
        ifm_present = Box([0, 0, 0, 0], [0, 0, 0, 0])
        producer_op = sched_op.ifm.connection.producers[0]
        prev_cmd_gen = generate_high_level_commands_for_sched_op(producer_op, schedule)

    ofm_step = op_info.stripe
    for start_height in range(ofm_start.height, ofm_end.height, ofm_step.height):
        end_height = min(start_height + ofm_step.height, ofm_end.height)
        for start_width in range(ofm_start.width, ofm_end.width, ofm_step.width):
            end_width = min(start_width + ofm_step.width, ofm_end.width)

            # LUT DMA is emitted at most once per (height, width) stripe
            lut_dma_done = False
            for depth_idx, start_channel in enumerate(ofm_depth_slices[:-1]):
                start_channel = max(start_channel, ofm_start.depth)
                end_channel = min(ofm_depth_slices[depth_idx + 1], ofm_end.depth)

                # Construct the OFM box for the current stripe
                ofm_box_start = Shape4D(ofm_start.batch, start_height, start_width, start_channel)
                ofm_box_end = Shape4D(ofm_end.batch, end_height, end_width, end_channel)
                ofm_box = Box(ofm_box_start.as_list(), ofm_box_end.as_list())
                ifm_box = Box([], [])
                ifm2_box = Box([], [])

                # Calculate IFM input box based on the OFM box
                if ifm:
                    ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        ifm.shape,
                        npu_block_type,
                        write_offset.as_list(),
                        read_offsets[0],
                        read_shapes[0],
                        k_height,
                        upscaling,
                    )

                # Calculate IFM2 input box based on the OFM box
                # (overwrites pad_top/pad_bottom from the IFM branch when both exist)
                if ifm2:
                    ifm2_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        ifm2.shape,
                        npu_block_type,
                        write_offset.as_list(),
                        read_offsets[1],
                        read_shapes[1],
                        k_height,
                        upscaling,
                    )

                ifm_required = ifm_box
                # Get the Op that produces this Op's IFM data - only applicable within cascades
                if producer_op:
                    assert op_info.cascade != 0
                    assert op_info.cascade == schedule.cost_map[producer_op].cascade
                    # Drain producer commands until this stripe's IFM region exists;
                    # the generator keeps its position across stripes
                    for prev_cmd in prev_cmd_gen:
                        yield prev_cmd
                        if prev_cmd.is_npu_pass_command() and prev_cmd.ps == producer_op.parent_ps:
                            ifm_present.end_coord = prev_cmd.ofm_box.end_coord
                        if ifm_required.is_subbox_of(ifm_present):
                            # There is enough IFM data - exit loop
                            break

                # Information about the current stripe's location in the cascade
                is_first_h_stripe = ofm_box_start.height == ofm_start.height
                is_last_h_stripe = ofm_box_end.height >= ofm_end.height

                # Calculate the weight box - i.e. the subshape of weights needed for this NpuStripe command
                weight_tensor = op_info.npu_weights_tensor
                scale_tensor = op_info.npu_scales_tensor
                if op_info.npu_weights_tensor:
                    weight_box = Box([0, 0, 0, start_channel], [1, 1, 1, end_channel])

                    if op_info.buffered_weight_tensor and is_first_h_stripe:
                        # Weights are double-buffered: DMA once per height stripe
                        yield from dma_if_necessary(sched_op.parent_ps, weight_box, op_info.buffered_weight_tensor)
                        weight_tensor = op_info.buffered_weight_tensor
                else:
                    weight_box = None

                # Should only be done once per loop but not before weights above
                if parent_op.activation_lut and not lut_dma_done:
                    lut_tensor = [tens for tens in parent_op.inputs if tens.purpose == TensorPurpose.LUT][0]
                    lut_box = Box([0] * len(lut_tensor.shape), list(lut_tensor.shape))
                    lut_dma_done = True
                    yield from dma_if_necessary(sched_op.parent_ps, lut_box, lut_tensor)

                # NOTE(review): pad_top/pad_bottom are only bound inside the
                # `if ifm:`/`if ifm2:` branches above - presumably every NPU op
                # has an IFM so at least one branch always runs; confirm
                yield NpuStripe(
                    sched_op.parent_ps,
                    block_config.old_style_representation(),
                    is_first_h_stripe,
                    is_last_h_stripe,
                    ifm_tensor,
                    ifm_box,
                    ofm_tensor,
                    ofm_box,
                    weight_tensor,
                    weight_box,
                    scale_tensor,
                    ifm2_tensor=ifm2_tensor,
                    ifm2_box=ifm2_box,
                    pad_top=pad_top,
                    pad_bottom=pad_bottom,
                )
Patrik Gustavsson2349d422020-12-01 16:02:29 +0100225 )