blob: 2297a3bf914263e6b54ac57ee706eaa1e75f1efe [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
18#
19# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
20# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Tim Hall79d07d22020-04-27 18:20:16 +010026from .operation import NpuBlockType
Charles Xu78792222020-05-13 10:15:26 +020027from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010028
29
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command for the given box of *tensor* if it needs one.

    A tensor that does not require DMA produces no commands.
    """
    if not tensor.needs_dma():
        return
    # The tensor's (single) producing op is the DMA op; its input is the
    # source tensor to copy from.
    src_tensor = tensor.ops[0].inputs[0]
    yield DMA(src_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010035
Tim Hallc30f4952020-06-15 20:47:35 +010036
def match_tensor(source, derived):
    """Return True if *derived* is *source* itself, or is produced from
    *source* by a single SplitSliceRead op."""
    if source == derived:
        return True
    producers = derived.ops
    if len(producers) != 1:
        return False
    producer = producers[0]
    return producer.type == "SplitSliceRead" and source == producer.inputs[0]
42
Tim Hall79d07d22020-04-27 18:20:16 +010043
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generate the high-level command stream (DMA + NpuStripe commands) for
    passes[idx] under the given scheduling strategy.

    strat: SchedulingStrategy.WeightStream or SchedulingStrategy.IfmStream.
    passes: list of scheduled passes forming the cascade.
    block_configs: per-pass block configuration, parallel to `passes`.
    idx: index of the pass to generate commands for.

    For IfmStream, earlier passes in the cascade are generated recursively
    and interleaved: their stripes are yielded on demand as this pass needs
    more IFM rows.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    # Collect read offsets of any fused SplitSliceRead ops (one per IFM).
    ifm_idx = 0
    for op in ps.ops:
        if op.type == "SplitSliceRead":
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            ifm_idx += 1

    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(ps.inputs[1], ps.primary_op.inputs[0]):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
            split_offsets[0], split_offsets[1] = split_offsets[1], split_offsets[0]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    upscaling = 1
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)
        if ps.primary_op.type in set(("Conv2DBackpropInputSwitchedBias", "ResizeBilinear")):
            # Transpose conv / resize upscale the IFM height; ratio of the
            # H dims (NHWC index -3) gives the upscaling factor.
            upscaling = ofm_tensor.shape[-3] // ifm_tensor.shape[-3]

    concat_axis = 0
    concat_offset = 0

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    # Apply fused ConcatSliceWrite (restricts the OFM window) and record any
    # fused activation on the primary op.
    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    if strat == SchedulingStrategy.WeightStream:
        # Stream over the OFM depth (last axis), one block of output
        # channels at a time; weights for each block are DMAed in.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # No weight streaming needed: emit a single full-depth stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[0],
                    upscaling,
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides,
                    skirt,
                    ifm2_tensor.shape,
                    npu_block_type,
                    concat_axis,
                    concat_offset,
                    split_offsets[1],
                    upscaling,
                )
            else:
                ifm2_box = Box([], [])

            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose in (TensorPurpose.FeatureMap, TensorPurpose.LUT)
                ):
                    if intermediate.purpose is TensorPurpose.FeatureMap:
                        intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                            strides,
                            skirt,
                            intermediate.shape,
                            npu_block_type,
                            concat_axis,
                            concat_offset,
                            split_offsets[0],
                            upscaling,
                        )
                    else:
                        # LUTs are transferred whole, not per-stripe.
                        intermediate_box = Box([0] * len(intermediate.shape), list(intermediate.shape))
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                if concat_axis - len(weight_tensor.shape) == -1:
                    # Concatenating along the channel axis: translate the OFM
                    # channel range back into the weight tensor's range.
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stream over the OFM height (axis -3), one row-band at a time,
        # pulling stripes from the previous pass as IFM rows are needed.
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address is not None and ofm_tensor.address is not None:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            k_height = 1
            # Fix: membership test, not equality against a set (the original
            # `==` comparison was always False, so pooling/reduce-sum passes
            # incorrectly fell through to the weight-tensor branch).
            if npu_block_type in set((NpuBlockType.Pooling, NpuBlockType.ReduceSum)):
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides,
                skirt,
                ifm_tensor.shape,
                npu_block_type,
                concat_axis,
                concat_offset,
                split_offsets[0],
                k_height,
                upscaling,
            )

            for intermediate in ps.intermediates:
                if (
                    intermediate is not None
                    and intermediate.shape != []
                    and intermediate.purpose == TensorPurpose.FeatureMap
                ):
                    intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                        strides,
                        skirt,
                        intermediate.shape,
                        npu_block_type,
                        concat_axis,
                        concat_offset,
                        split_offsets[0],
                        upscaling,
                    )
                    yield from dma_if_necessary(ps, intermediate_box, intermediate)

            # Drain stripes from the previous pass until enough IFM rows
            # have been produced for this stripe.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            # Weights are DMAed once, before the first stripe that needs them.
            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
321
322
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield high-level commands for a whole pass list under the given strategy."""
    if strat == SchedulingStrategy.WeightStream:
        # Weight streaming: each pass is emitted independently, in order.
        for pass_idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
    elif strat == SchedulingStrategy.IfmStream:
        # IFM streaming: start from the final pass; it recursively pulls in
        # stripes from earlier passes as their output rows become needed.
        last_idx = len(passes) - 1
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, last_idx)
    else:
        assert 0, "Unknown streaming strategy"
331
332
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield high-level commands for one cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
337
338
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph *sg* and attach it
    as sg.high_level_command_stream.

    Only cascaded passes placed on the NPU contribute commands.
    """
    commands = []
    for cascaded_pass in sg.cascaded_passes:
        if cascaded_pass.placement == PassPlacement.Npu:
            commands.extend(generate_high_level_command_stream_for_cascaded_pass(cascaded_pass))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
348
349
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how many bytes the OFM may safely overlap the IFM in storage.

    Simulates the command stream for the pass list and tracks, stripe by
    stripe, how far OFM writes run ahead of IFM reads; the minimum safe
    overlap over all stripes is returned. Returns 0 whenever the overlap
    cannot be determined (missing IFM/OFM tensor, weight streaming, or
    unresolved address offsets).
    """
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Fix: call storage_size() — the original assigned the bound method
    # itself, leaving a non-numeric default that would break min() if used
    # before the first is_first command refines it.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999
    ofm_size = passes[-1].ofm_tensor.storage_size()
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest address this stripe reads from the IFM.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                # Highest address this stripe writes to the OFM.
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # After the stripe runs, its whole IFM box has been consumed.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
382
383
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Return the allowed IFM/OFM storage overlap (bytes) for one cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)