blob: 60e62aa6dae1e334d748f347102aa59db62f1263 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# Also used during scheduling to work out the allowable IFM/OFM overlap; this functionality can be accessed using
# calc_allowed_ofm_ifm_overlap_for_pass_list().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Charles Xu89a6bbf2020-08-11 12:31:58 +020026from .numeric_util import round_up_divide
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from .operation import create_activation_function
Tim Hall79d07d22020-04-27 18:20:16 +010028from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020029from .operation import Op
patrik.gustavssoneeb85152020-12-21 17:10:40 +000030from .shape4d import Shape4D
Charles Xu78792222020-05-13 10:15:26 +020031from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010032
33
def dma_if_necessary(ps, box, tensor):
    """Yield a single DMA command for ``tensor`` if the tensor reports that it needs one.

    Args:
        ps: The pass the DMA command is issued for (forwarded to ``DMA``).
        box: The region of the tensor to transfer (forwarded to ``DMA``).
        tensor: The destination tensor; queried through ``needs_dma()``.

    Yields:
        At most one ``DMA`` command. Nothing is yielded when ``tensor.needs_dma()`` is false.
    """
    if not tensor.needs_dma():
        return
    # The first op attached to the tensor is treated as the DMA op; its first input is the transfer source.
    source_tensor = tensor.ops[0].inputs[0]
    yield DMA(ps, source_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010039
Tim Hallc30f4952020-06-15 20:47:35 +010040
def match_tensor(source, derived):
    """Tell whether ``derived`` is ``source`` itself or a single SplitSliceRead of it.

    Args:
        source: The tensor to look for.
        derived: The tensor that may equal ``source`` or be produced from it.

    Returns:
        True when the two compare equal. Otherwise the result of checking that ``derived`` has exactly one
        producing op, that this op is of type ``Op.SplitSliceRead``, and that its first input equals ``source``.
    """
    if source == derived:
        return True
    producers = derived.ops
    # Exactly one producer is required; this also covers the "no producers" case.
    if len(producers) != 1:
        return False
    producer = producers[0]
    return producer.type == Op.SplitSliceRead and source == producer.inputs[0]
Tim Hallc30f4952020-06-15 20:47:35 +010046
Tim Hall79d07d22020-04-27 18:20:16 +010047
def _dma_for_intermediates(
    ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offset, upscaling
):
    """Yield the DMA commands required by the intermediate tensors of ``ps`` for one OFM box.

    Only non-scalar intermediates whose purpose is FeatureMap or LUT are considered. Feature maps get a box
    derived from ``ofm_box``; LUTs get a box covering the whole tensor.
    """
    for intermediate in ps.intermediates:
        if (
            intermediate is not None
            and intermediate.shape != []
            and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
        ):
            if intermediate.purpose is TensorPurpose.FeatureMap:
                # NOTE(review): upscaling is passed positionally right after the split offset, in the slot where
                # the IfmStream IFM call passes k_height - confirm against transform_with_strides_and_skirt.
                intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    Shape4D(intermediate.shape),
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offset,
                    upscaling,
                )
            else:
                intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
            yield from dma_if_necessary(ps, intermediate_box, intermediate)


def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generate the high-level commands (DMA and NpuStripe) for pass ``passes[idx]``.

    This is a generator. As a side effect it updates the pass: for element-wise passes with two inputs the
    IFM/IFM2 tensors and shapes may be swapped to match the primary op's input order, and the primary op's
    ``memory_function`` and ``activation`` are set from the SplitSliceRead / ConcatSliceWrite / activation ops
    found in ``ps.ops``.

    Args:
        strat: A ``SchedulingStrategy``; WeightStream and IfmStream are handled.
        passes: The list of passes being scheduled together.
        block_configs: Block configurations, indexed like ``passes``.
        idx: Index of the pass to generate commands for.

    Yields:
        ``DMA`` and ``NpuStripe`` commands. With IfmStream, commands of the preceding pass (``idx - 1``) are
        generated recursively and interleaved, just far enough to provide the IFM rows each stripe needs.

    Raises:
        AssertionError: On an unknown strategy, or when an internal consistency check fails.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
            ps.ifm_shapes[0], ps.ifm_shapes[1] = ps.ifm_shapes[1], ps.ifm_shapes[0]

        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                ps.primary_op.memory_function = op.type
                assert len(op.inputs) == 1
                # Attribute the split offset to whichever input the slice reads from
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    assert False
    else:
        # Without two element-wise inputs, split offsets are assigned in the order the slice ops appear
        ifm_idx = 0
        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.memory_function = op.type
                ifm_idx += 1

    # A shape of None marks a tensor whose shape is [] (or, for ifm2, a missing tensor)
    ifm_tensor = ps.ifm_tensor
    ifm_shape = None
    if ifm_tensor.shape != []:
        ifm_shape = ps.ifm_shapes[0]
    ifm2_tensor = ps.ifm2_tensor
    ifm2_shape = None
    if ifm2_tensor is not None and ifm2_tensor.shape != []:
        ifm2_shape = ps.ifm_shapes[1]
    ofm_tensor = ps.ofm_tensor
    ofm_shape = ps.ofm_shapes[0]
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    ofm_start = [0, 0, 0, 0]
    ofm_end = ofm_shape.as_list()

    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
            upscaling = ofm_shape.height // ifm_shape.height
        elif ps.primary_op.type == Op.ResizeBilinear:
            upscaling = round_up_divide(ofm_shape.height, ifm_shape.height)

    concat_axis = 0
    concat_offset = 0

    for op in ps.ops:
        if op.type == Op.ConcatSliceWrite:
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            # Restrict the OFM region to the slice of the concatenation this pass writes
            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.memory_function = op.type
        elif op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(op.type)

    if strat == SchedulingStrategy.WeightStream:
        # Step along the last OFM axis in block-config sized chunks
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # No weights to stream in: cover the whole range in a single step
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # NOTE(review): in the two calls below upscaling is passed positionally right after the split
            # offset, whereas the IfmStream call passes k_height in that position followed by upscaling -
            # confirm against the transform_with_strides_and_skirt signature.
            if ifm_shape is not None:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_shape is not None:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_shape, npu_block_type, concat_axis, concat_offset, split_offsets[1], upscaling,
                )
            else:
                ifm2_box = Box([], [])

            yield from _dma_for_intermediates(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling
            )

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                if concat_axis - len(weight_tensor.shape) == -1:
                    # Concatenation is along the last axis: make the weight range relative to the concat offset
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        assert ifm_shape is not None
        # Step along the third-from-last OFM axis in block-config sized stripes
        y_step = block_config[0]
        y_start = ofm_start[-3]
        y_dim = ofm_end[-3]

        if idx > 0:
            # The IFM is produced by the previous pass; nothing is available until that pass has run
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass of the list: the whole IFM is already present
            ifm_y_present = ifm_shape.height
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Weight and scale boxes are created (and DMA'd) once, on the first stripe that needs them
        weight_box = None
        scale_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            ofm_start[-3] = start
            ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            yield from _dma_for_intermediates(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0], upscaling
            )

            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                # Advance the previous pass only until it has produced the rows this stripe reads;
                # the generator keeps its position, so the next stripe resumes where this one stopped
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
                yield from dma_if_necessary(ps, scale_box, scale_tensor)

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
334
335
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Generate the high-level commands for a list of passes under the given strategy.

    Args:
        strat: A ``SchedulingStrategy``; WeightStream and IfmStream are handled.
        passes: The passes to generate commands for.
        block_configs: Block configurations, indexed like ``passes``.

    Yields:
        The commands produced by ``generate_high_level_command_stream_for_pass``.

    Raises:
        AssertionError: If ``strat`` is neither WeightStream nor IfmStream.
    """
    if strat == SchedulingStrategy.IfmStream:
        # Only the last pass is requested; it pulls in the preceding passes itself
        final_idx = len(passes) - 1
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, final_idx)
    elif strat == SchedulingStrategy.WeightStream:
        # Each pass is generated on its own, in order
        for pass_idx, _ in enumerate(passes):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
    else:
        assert 0, "Unknown streaming strategy"
344
345
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Generate the high-level commands for one cascaded pass.

    Args:
        cps: The cascaded pass; its ``strategy`` and ``passes`` are used, together with the
            ``block_config`` of each pass.

    Yields:
        The commands produced by ``generate_high_level_command_stream_for_pass_list``.
    """
    per_pass_block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, per_pass_block_configs)
350
351
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream of a subgraph and store it on the subgraph.

    Args:
        nng: Not used by this function.
        sg: The subgraph; commands are generated for each of its cascaded passes placed on the NPU and the
            resulting list is assigned to ``sg.high_level_command_stream``.
        arch: Not used by this function.
        verbose_high_level_command_stream: When true, ``sg.print_high_level_command_stream()`` is called
            after the stream has been stored.
    """
    sg.high_level_command_stream = [
        cmd
        for cps in sg.cascaded_passes
        if cps.placement == PassPlacement.Npu
        for cmd in generate_high_level_command_stream_for_cascaded_pass(cps)
    ]
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
361
362
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Work out how much of the OFM may overlap the IFM for a list of passes.

    The command stream for the passes is generated and walked. For every NPU pass command that belongs to
    the first or last pass, the highest OFM offset written so far is compared with the lowest IFM offset
    still to be read, and the smallest resulting margin is kept.

    Args:
        strat: A ``SchedulingStrategy``.
        passes: The passes; the IFM of the first and the OFM of the last are considered.
        block_configs: Block configurations, indexed like ``passes``.

    Returns:
        The smallest margin found, clamped to be non-negative. 0 is returned when the first pass has no IFM
        tensor, the last pass has no OFM tensor, the strategy is WeightStream, or an address offset cannot
        be computed. If no command of the first or last pass is seen, the initial sentinel value is
        returned unchanged.
    """
    first_pass = passes[0]
    if not first_pass.ifm_tensor:
        return 0
    last_pass = passes[-1]
    if not last_pass.ofm_tensor:
        return 0

    # Until the first pass has issued a command, treat the whole IFM as still unread
    ifm_read = first_pass.ifm_tensor.storage_size()
    ofm_size = last_pass.ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0

    highest_ofm_write = 0
    smallest_margin = 999999999999999999999
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if not cmd.is_npu_pass_command():
            continue

        if cmd.is_first:
            # Offset at which this stripe starts reading the IFM
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(
                cmd.ifm_box.start_coord, cmd.ps.ifm_shapes[0].as_list(), is_top_box=False
            )
            if ifm_read is None:
                return 0

        if cmd.is_last:
            # Offset up to which this stripe writes the OFM
            write_offset = cmd.ofm_tensor.address_offset_for_coordinate(
                cmd.ofm_box.end_coord, cmd.ps.ofm_shapes[0].as_list(), is_top_box=True
            )
            if write_offset is None:
                return 0
            highest_ofm_write = max(write_offset, highest_ofm_write)

        if cmd.is_first or cmd.is_last:
            overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
            smallest_margin = min(smallest_margin, ofm_size - overlap_required)

        if cmd.is_first:
            # After the stripe has run, the IFM has been consumed up to the end of its box
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(
                cmd.ifm_box.end_coord, cmd.ps.ifm_shapes[0].as_list(), is_top_box=True
            )

    return max(smallest_margin, 0)