blob: 3b968dc80fa1b95a49b1150118d863026c5a8f86 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
18#
19# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
20# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Tim Hall79d07d22020-04-27 18:20:16 +010026from .operation import NpuBlockType
Charles Xu78792222020-05-13 10:15:26 +020027from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010028
29
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command bringing *tensor*'s data in for *ps*, if required.

    A tensor reports via needs_dma() whether its data lives behind a DMA
    producer op; in that case the DMA op's input is the real source buffer.
    """
    if not tensor.needs_dma():
        return
    src_tensor = tensor.ops[0].inputs[0]
    yield DMA(src_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010035
36
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generate DMA and NpuStripe commands for the pass passes[idx].

    strat selects how the output is decomposed into stripes:
    - SchedulingStrategy.WeightStream: stripes along the last (depth) axis of
      the OFM, one stripe per block of output channels.
    - SchedulingStrategy.IfmStream: stripes along the -3 (height) axis of the
      OFM; commands for the previous pass in the cascade are generated
      recursively and interleaved so that enough IFM rows are present before
      each stripe executes.

    block_configs is indexed in lockstep with passes. is_first/is_last refer
    to the pass's position in the whole pass list, not to stripes.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full-OFM box; narrowed below by concat writes and per-stripe slicing.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)

    npu_block_type = ps.npu_block_type

    concat_axis = 0
    concat_offset = 0

    split_offsets = [None, None]  # offset for [ifm, ifm2]

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    # Fold concat writes and fusable activations into the primary op's attrs.
    # A ConcatSliceWrite restricts the OFM box to this pass's slice of the
    # concatenated output tensor.
    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    # The ops list has to be reversed here since the Pass Packing is done in reverse
    ifm_idx = 0
    for op in reversed(ps.ops):
        if op.type == "SplitSliceRead":
            # Record where in the source tensor each IFM read begins.
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            ifm_idx += 1

    if strat == SchedulingStrategy.WeightStream:
        # Stripe over the depth (output-channel) axis in block-config steps.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # Without weight DMA there is no reason to split: one big stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # Map the OFM stripe back to the IFM region it consumes; a
            # shapeless tensor gets an empty box.
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
                )
            else:
                ifm2_box = Box([], [])

            # DMA in any feature-map intermediates this stripe touches.
            for intermediate in ps.intermediates:
                if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                    )
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When the concat is along the weights' output-channel axis,
                # the stripe coordinates include the concat offset; remove it
                # to index into this pass's own weight tensor.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,  # each depth stripe is both first and last in height
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stripe over the height (-3) axis in block-config steps.
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Cascaded: pull stripes from the previous pass on demand.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM already exists in memory.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height widens the IFM rows each OFM stripe depends on:
            # taken from ksize for pooling, else from the weight tensor.
            k_height = 1
            if npu_block_type == NpuBlockType.Pooling:
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

            # DMA in any feature-map intermediates this stripe touches.
            for intermediate in ps.intermediates:
                if intermediate != None and intermediate.shape != [] and intermediate.purpose == TensorPurpose.FeatureMap:
                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                        strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                    )
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            # Emit just enough of the previous pass's commands so that the
            # IFM rows this stripe reads have been produced.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            # Weights cover all stripes; DMA them once, before the first one.
            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
261
262
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Generate the command stream for a whole scheduled pass list.

    WeightStream drives every pass in order; IfmStream drives only the last
    pass, which recursively interleaves its predecessors' stripes as needed.
    """
    if strat == SchedulingStrategy.WeightStream:
        for pass_idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
    elif strat == SchedulingStrategy.IfmStream:
        last_idx = len(passes) - 1
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, last_idx)
    else:
        assert 0, "Unknown streaming strategy"
271
272
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Generate the command stream for one cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
277
278
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build sg.high_level_command_stream from the subgraph's cascaded passes.

    Only NPU-placed cascaded passes contribute commands; other placements are
    skipped. Optionally prints the resulting stream.
    """
    commands = []
    for cps in sg.cascaded_passes:
        if cps.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cps))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
288
289
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how many bytes of the final OFM may overlap the first IFM.

    Dry-runs the command stream and tracks, per stripe, the highest OFM byte
    written against the lowest IFM byte still to be read; the slack between
    them bounds how far the OFM buffer can be laid over the IFM buffer.
    Returns 0 when overlap cannot be determined (missing tensors, unknown
    address offsets) or for WeightStream scheduling.
    """
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Fix: storage_size is a method; previously the bound method itself was
    # assigned, which would make min(ifm_read, ofm_size) raise TypeError if
    # used before the first is_first command reassigns ifm_read.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest IFM byte this stripe starts reading from.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                # Bytes of OFM that must not be placed over unread IFM data.
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # After this stripe runs, IFM data up to end_coord is consumed.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
322
323
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Allowed IFM/OFM overlap in bytes for one cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)