blob: 14cd051e4e9672cf2c61e5bcd15aac6bc2b6113c [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
18#
19# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
20# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Charles Xu89a6bbf2020-08-11 12:31:58 +020026from .numeric_util import round_up_divide
Louis Verhaarde8a5a782020-11-02 18:04:27 +010027from .operation import create_activation_function
Tim Hall79d07d22020-04-27 18:20:16 +010028from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020029from .operation import Op
Charles Xu78792222020-05-13 10:15:26 +020030from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010031
32
def dma_if_necessary(ps, box, tensor):
    # Emit a DMA command for this tensor if it must be copied into faster
    # memory before the pass can consume it; otherwise emit nothing.
    if not tensor.needs_dma():
        return
    producer = tensor.ops[0]
    yield DMA(ps, producer.inputs[0], tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010038
Tim Hallc30f4952020-06-15 20:47:35 +010039
def match_tensor(source, derived):
    """Return True if `derived` is `source` itself, or a SplitSliceRead view of it.

    A tensor produced by exactly one SplitSliceRead op is considered a match
    when that op reads from `source`.
    """
    if source == derived:
        return True
    ops = derived.ops
    # len(ops) == 1 already implies ops is non-empty, so the original
    # `ops != []` guard was redundant and has been dropped.
    return len(ops) == 1 and ops[0].type == Op.SplitSliceRead and source == ops[0].inputs[0]
Tim Hallc30f4952020-06-15 20:47:35 +010045
Tim Hall79d07d22020-04-27 18:20:16 +010046
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generate high-level commands (DMA and NpuStripe) for passes[idx].

    This is a generator. Depending on `strat` it either:
      - WeightStream: steps over the OFM depth (last axis) in block-sized
        slices, yielding weight/intermediate DMA commands and one NpuStripe
        per depth slice; or
      - IfmStream: steps over the OFM height (axis -3) in block-sized rows,
        recursively interleaving commands from the previous pass in `passes`
        until enough IFM rows are available for each stripe.

    NOTE(review): this function also mutates `ps` as a side effect (it may
    swap ifm/ifm2, and sets primary_op.memory_function / activation).
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    if ps.ifm_tensor is not None and ps.ifm2_tensor is not None and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order: if the pass inputs are crossed
        # relative to the primary op's inputs, swap them back.
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor

        # Record the slice start offset of each SplitSliceRead, matched to
        # the ifm/ifm2 it feeds.
        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                ps.primary_op.memory_function = op.type
                assert len(op.inputs) == 1
                if match_tensor(ps.ifm_tensor, op.inputs[0]):
                    split_offsets[0] = op.attrs["split_start"]
                elif match_tensor(ps.ifm2_tensor, op.inputs[0]):
                    split_offsets[1] = op.attrs["split_start"]
                else:
                    assert False
    else:
        # Non-elementwise: assign SplitSliceRead offsets in encounter order
        # (at most two are expected).
        ifm_idx = 0
        for op in ps.ops:
            if op.type == Op.SplitSliceRead:
                assert ifm_idx < 2
                split_offsets[ifm_idx] = op.attrs["split_start"]
                ps.primary_op.memory_function = op.type
                ifm_idx += 1

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full OFM box to begin with; narrowed below by concat and stripe loops.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    upscaling = 1  # vertical IFM->OFM upscaling factor (transpose conv / resize)
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        if ps.primary_op.type == Op.Conv2DBackpropInputSwitchedBias:
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]
        elif ps.primary_op.type == Op.ResizeBilinear:
            upscaling = round_up_divide(ofm_tensor.shape[-3], ifm_tensor.shape[-3])

    concat_axis = 0
    concat_offset = 0

    # Apply concat writes (narrow the OFM box to the concat slice) and fuse
    # activation ops into the primary op.
    for op in ps.ops:
        if op.type == Op.ConcatSliceWrite:
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.memory_function = op.type
        elif op.type.is_relu_op() or op.type in (Op.Tanh, Op.Sigmoid):
            ps.primary_op.activation = create_activation_function(op.type)

    if strat == SchedulingStrategy.WeightStream:
        # Stream over the OFM depth (last axis) one block at a time.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # No weight DMA needed, so a single full-depth stripe suffices.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # Map the OFM slice back to the IFM region it needs.
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

            # DMA any feature-map/LUT intermediates needed by this stripe.
            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        # LUTs are transferred whole.
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When concatenating along the last (output-channel) axis the
                # weight range is relative to the concat slice, not the OFM.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Depth stripes always span the full height, so the h-stripe
            # first/last flags are both True.
            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stream over the OFM height (axis -3) one block row at a time.
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Cascaded: IFM rows are produced on demand by the previous pass.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM is already available.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Weight/scale DMAs are issued once, before the first stripe that
        # needs them (boxes double as "already issued" flags).
        weight_box = None
        scale_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height determines how much extra IFM each stripe reads.
            k_height = 1
            if npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            # DMA any feature-map/LUT intermediates needed by this stripe.
            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        # LUTs are transferred whole.
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            # Pull commands from the previous pass until it has produced
            # enough IFM rows for this stripe.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if scale_tensor is not None and scale_tensor.purpose == TensorPurpose.FSBias and scale_box is None:
                scale_box = Box([0] * len(scale_tensor.shape), list(scale_tensor.shape))
                yield from dma_if_necessary(ps, scale_box, scale_tensor)

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
342
343
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    # Weight streaming emits every pass in order; IFM streaming starts from
    # the final pass, which recursively pulls in its producers as needed.
    if strat == SchedulingStrategy.WeightStream:
        for pass_idx, _ in enumerate(passes):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
    elif strat == SchedulingStrategy.IfmStream:
        last_idx = len(passes) - 1
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, last_idx)
    else:
        assert 0, "Unknown streaming strategy"
352
353
def generate_high_level_command_stream_for_cascaded_pass(cps):
    # Expand a cascaded pass into its command stream using the block config
    # chosen for each contained pass.
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
358
359
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    # Collect the command streams of all NPU-placed cascaded passes and
    # attach the combined stream to the subgraph.
    commands = []
    for cascaded_pass in sg.cascaded_passes:
        if cascaded_pass.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cascaded_pass))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
369
370
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how many bytes of the final OFM may safely overlap the first IFM.

    Simulates the command stream and, at each first/last stripe boundary,
    compares the highest OFM byte written so far against the lowest IFM byte
    still to be read; the worst-case slack over the whole stream is the
    allowed overlap. Returns 0 when overlap cannot be established (missing
    tensors, unknown address offsets, or weight streaming).
    """
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Bug fix: storage_size is a method; the original stored the bound method
    # object itself, so min(ifm_read, ofm_size) would raise TypeError if a
    # command with is_last arrived before any is_first reassigned ifm_read.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999  # effectively +infinity sentinel
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest IFM address this stripe starts reading from.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                # Highest OFM address this stripe writes.
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # Advance the read watermark past the data this stripe consumed.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap