blob: 50b913d88eddf24b4ca1ee0f8c5fcf5c92adc7a0 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
18#
19# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
20# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Charles Xu89a6bbf2020-08-11 12:31:58 +020026from .numeric_util import round_up_divide
Tim Hall79d07d22020-04-27 18:20:16 +010027from .operation import NpuBlockType
Charles Xu78792222020-05-13 10:15:26 +020028from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010029
30
def dma_if_necessary(ps, box, tensor):
    # Yield a DMA command for `tensor` when it is marked as needing a DMA
    # transfer; otherwise yield nothing. The source is the single producer
    # op's first input tensor.
    if not tensor.needs_dma():
        return
    src_tensor = tensor.ops[0].inputs[0]
    yield DMA(ps, src_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010036
Tim Hallc30f4952020-06-15 20:47:35 +010037
def match_tensor(source, derived):
    """Return True if `derived` is `source` itself, or is produced from
    `source` by a single SplitSliceRead op."""
    if source == derived:
        return True
    producer_ops = derived.ops
    if len(producer_ops) != 1:
        return False
    producer = producer_ops[0]
    return producer.type == "SplitSliceRead" and source == producer.inputs[0]
43
Tim Hall79d07d22020-04-27 18:20:16 +010044
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Yield high-level commands (DMA and NpuStripe) for passes[idx].

    strat selects the streaming strategy. For WeightStream the OFM is split
    along its last (depth) axis; for IfmStream the OFM is split along its
    height axis and commands from the previous pass in the cascade are pulled
    in lazily (recursively) whenever more IFM rows are needed.
    NOTE(review): mutates ps in place (swaps ifm/ifm2, sets primary_op attrs).
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor

        # Record per-input split offsets for fused SplitSliceRead ops,
        # matching each one to ifm or ifm2 by its input tensor.
        for op in ps.ops:
            if op.type == "SplitSliceRead":
                ps.primary_op.attrs["fused_memory_function"] = op.type
                assert len(op.inputs) == 1
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    assert False
    else:
        # Non-elementwise: assign SplitSliceRead offsets in encounter order.
        ifm_idx = 0
        for op in ps.ops:
            if op.type == "SplitSliceRead":
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.attrs["fused_memory_function"] = op.type
                ifm_idx += 1

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full OFM box; narrowed below per stripe and by ConcatSliceWrite.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        # Upscaling factor between IFM and OFM height (axis -3) for ops that
        # enlarge the feature map.
        if ps.primary_op.type == "Conv2DBackpropInputSwitchedBias":
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
        elif ps.primary_op.type == "ResizeBilinear":
            upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])

    concat_axis = 0
    concat_offset = 0

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    # Apply fused ops: ConcatSliceWrite narrows the OFM write window,
    # activations are recorded on the primary op.
    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    if strat == SchedulingStrategy.WeightStream:
        # Stripe along the OFM depth axis, one block of output channels at a
        # time; weights for each stripe are DMAed in as needed.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # No weight streaming needed - emit a single full-depth stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

            # DMA in intermediates (feature maps get a transformed box,
            # LUTs are transferred whole).
            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When concatenating along the channel axis the weight output
                # channels are relative to the concat offset.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stripe along the OFM height axis (-3); earlier passes in the
        # cascade are generated lazily to supply the IFM rows each stripe
        # needs.
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Cascaded: no IFM rows available yet; the previous pass's
            # generator produces them on demand.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM is already resident.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height drives how much extra IFM each stripe needs.
            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose == TensorPurpose.FeatureMap
                ):
                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        intermediate.shape,
                        npu_block_type,
                        concat_axis,
                        concat_offset,
                        split_offsets[0],
                        upscaling,
                    )
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            # Pull commands from the previous pass until enough IFM rows
            # (up to ifm_y_needed) have been produced for this stripe.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            # Weights are DMAed once, before the first stripe that needs them.
            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
335
336
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield the high-level commands for a list of passes under `strat`.

    WeightStream generates every pass in order; IfmStream starts at the last
    pass, which lazily pulls in its predecessors as it needs their output.
    """
    if strat == SchedulingStrategy.WeightStream:
        pass_indices = range(len(passes))
    elif strat == SchedulingStrategy.IfmStream:
        pass_indices = [len(passes) - 1]
    else:
        assert 0, "Unknown streaming strategy"
    for pass_idx in pass_indices:
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
345
346
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield the high-level commands for one CascadedPass."""
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
351
352
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph `sg` and store it on
    sg.high_level_command_stream. Only NPU-placed cascaded passes contribute.
    (`nng` and `arch` are part of the generator-pass signature but unused here.)
    """
    commands = []
    for cascaded_pass in sg.cascaded_passes:
        if cascaded_pass.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cascaded_pass))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
362
363
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how many bytes the final OFM may safely overlap the initial IFM.

    Simulates the command stream and, at every first-pass/last-pass command,
    compares the highest OFM byte written so far against the lowest IFM byte
    still to be read. Returns 0 when overlap cannot be proven safe (missing
    tensors, unresolvable addresses, or weight streaming).
    """
    highest_ofm_write = 0
    # Without both an input and an output tensor there is nothing to overlap.
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Fix: storage_size was referenced without calling it, assigning the bound
    # method instead of a byte count; any use before the first reassignment
    # would have raised a TypeError in min() below.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999  # effectively +infinity
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest IFM address this stripe still needs to read.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                # Highest OFM address written after this stripe.
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                # Overlap is limited by how far writes have run ahead of reads.
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # After the stripe executes, its whole IFM box has been consumed.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
396
397
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Compute the allowed IFM/OFM overlap for one CascadedPass."""
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)