# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# Also used during scheduling to work out the allowable IFM/OFM overlap; this functionality can be accessed via
# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
from .high_level_command_stream import Box
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .numeric_util import round_up_divide
from .operation import NpuBlockType
from .operation import Op
from .tensor import TensorPurpose


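# Yield a DMA command for the requested box of the given tensor, if the tensor needs to be
# transferred by DMA (used below for weight tensors and for intermediate/LUT tensors)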
def dma_if_necessary(ps, box, tensor):
    if tensor.needs_dma():
        dma_op = tensor.ops[0]
        in_tensor = dma_op.inputs[0]
        yield DMA(ps, in_tensor, tensor, box)


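# Check whether 'derived' is the same tensor as 'source', either directly or through a single
# SplitSliceRead op that reads from 'source'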
def match_tensor(source, derived):
    if source == derived:
        return True
    ops = derived.ops
    return ops != [] and len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0]


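# Generate the DMA and NpuStripe commands for pass 'idx' of the scheduled pass list.
# For SchedulingStrategy.IfmStream the generator recurses into the preceding passes, so their
# stripes are emitted as soon as their output rows are needed as input here.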
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

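    # Work out the IFM/IFM2 ordering for elementwise ops and pick up any SplitSliceRead offsets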
    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor

        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                ps.primary_op.memory_function = op.type
                assert len(op.inputs) == 1
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    assert False
    else:
        ifm_idx = 0
        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.memory_function = op.type
                ifm_idx += 1

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

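    # Striding, padding skirt and the IFM-to-OFM upscaling factor are dictated by the primary op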
    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
        elif ps.primary_op.type == Op.ResizeBilinear:
            upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])

    concat_axis = 0
    concat_offset = 0

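    # Fold any ConcatSliceWrite into the OFM box and record fused activation functions on the primary op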
    for op in ps.ops:
        if op.type == Op.ConcatSliceWrite:
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.memory_function = op.type
        elif op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = op.type

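    # Generate the stripe sequence according to the scheduling strategy: WeightStream steps through
    # the OFM in depth slices, IfmStream steps through the OFM in horizontal stripes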
    if strat == SchedulingStrategy.WeightStream:
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

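            # DMA any intermediate tensors (feature maps or LUTs) needed by this stripe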
            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

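            # Select the weight slice for this OFM depth range and DMA it if the weights need transferring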
            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        weight_box = None

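        # Step through the OFM in horizontal stripes of y_step rows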
        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

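            # The kernel height determines how many extra IFM rows each OFM stripe depends on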
            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

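            # If the previous pass has not yet produced enough IFM rows for this stripe, interleave its
            # commands here until it has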
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"


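# Generate commands for a list of passes: WeightStream cascades are emitted pass by pass, while
# IfmStream cascades are driven from the last pass, which pulls in earlier passes as needed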
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    if strat == SchedulingStrategy.WeightStream:
        for idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
    elif strat == SchedulingStrategy.IfmStream:
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
    else:
        assert 0, "Unknown streaming strategy"


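# Generate the high-level command stream for a single cascaded pass, using its scheduling strategy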
def generate_high_level_command_stream_for_cascaded_pass(cps):
    yield from generate_high_level_command_stream_for_pass_list(
        cps.strategy, cps.passes, [ps.block_config for ps in cps.passes]
    )


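# Generate the high-level command stream for all NPU-placed cascaded passes of a subgraph and
# store the result on sg.high_level_command_stream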
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    res = []
    for cps in sg.cascaded_passes:
        if cps.placement == PassPlacement.Npu:
            res += list(generate_high_level_command_stream_for_cascaded_pass(cps))

    sg.high_level_command_stream = res
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()


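# Calculate how much of the OFM tensor may safely overlap the IFM tensor for a pass list, by
# generating the command stream and tracking the highest OFM write offset against the IFM read offsets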
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    ifm_read = passes[0].ifm_tensor.storage_size()  # initial value, refined per command below
    min_overlap = 999999999999999999999
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap


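# Calculate the allowed OFM/IFM overlap for a cascaded pass; see the module description above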
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, [ps.block_config for ps in cps.passes])