blob: 0cd3ad22cc541ce75aeae77e42922fef05734c0e [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# Also used during scheduling to work out the allowable IFM/OFM overlap; this functionality can be accessed using
# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
Diego Russoe8a10452020-04-21 17:39:10 +010021from .high_level_command_stream import Box
22from .high_level_command_stream import DMA
23from .high_level_command_stream import NpuStripe
24from .nn_graph import PassPlacement
25from .nn_graph import SchedulingStrategy
Tim Hall79d07d22020-04-27 18:20:16 +010026from .operation import NpuBlockType
Charles Xu78792222020-05-13 10:15:26 +020027from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010028
29
def dma_if_necessary(ps, box, tensor):
    """Yield a DMA command for ``tensor`` if the tensor reports that it needs one.

    Args:
        ps: The pass being processed. Not used by this function; kept so that
            existing call sites keep working.
        box: Region handed through unchanged to the DMA command.
        tensor: Destination tensor. When ``tensor.needs_dma()`` is true, the
            source is the first input of the first entry in ``tensor.ops``.

    Yields:
        At most one ``DMA(in_tensor, tensor, box)`` command.
    """
    if tensor.needs_dma():
        dma_op = tensor.ops[0]
        in_tensor = dma_op.inputs[0]
        yield DMA(in_tensor, tensor, box)
Tim Hall79d07d22020-04-27 18:20:16 +010035
def match_tensor(source, derived):
    """Return True if ``derived`` is ``source`` or a single-op slice read of it.

    Args:
        source: Tensor to look for.
        derived: Tensor that may equal ``source`` or may be produced from it.

    Returns:
        True when the two compare equal, or when ``derived.ops`` holds exactly
        one op of type "SplitSliceRead" whose first input equals ``source``;
        False otherwise.
    """
    if source == derived:
        return True
    ops = derived.ops
    # A length of exactly one already excludes the empty list, so no separate emptiness test is needed.
    return len(ops) == 1 and ops[0].type == "SplitSliceRead" and source == ops[0].inputs[0]
Tim Hall79d07d22020-04-27 18:20:16 +010044
def _generate_intermediate_dmas(
    ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offset
):
    """Yield the DMA commands needed by the feature-map intermediates of ``ps`` for one OFM box."""
    for intermediate in ps.intermediates:
        if (
            intermediate is not None
            and intermediate.shape != []
            and intermediate.purpose == TensorPurpose.FeatureMap
        ):
            intermediate_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, intermediate.shape, npu_block_type, concat_axis, concat_offset, split_offset
            )
            yield from dma_if_necessary(ps, intermediate_box, intermediate)


def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Yield the high-level commands for ``passes[idx]`` under scheduling strategy ``strat``.

    Args:
        strat: ``SchedulingStrategy.WeightStream`` or ``SchedulingStrategy.IfmStream``.
        passes: Sequence of passes; ``passes[idx]`` is the one being processed.
        block_configs: Per-pass block configurations, indexed like ``passes``.
            With WeightStream the last element is the step along the last OFM
            axis; with IfmStream the first element is the step along OFM axis -3.
        idx: Index of the pass to generate commands for.

    Yields:
        DMA commands (only for tensors that report needing one) and NpuStripe
        commands. With IfmStream and ``idx > 0`` the commands of the previous
        pass are generated recursively and interleaved on demand, so that
        enough IFM rows are present before each stripe of this pass.

    Raises:
        AssertionError: If ``strat`` is neither of the two supported strategies.

    Side effects:
        Writes "fused_memory_function" / "fused_activation_function" into
        ``ps.primary_op.attrs`` and may swap ``ps.ifm_tensor`` with
        ``ps.ifm2_tensor`` for two-input elementwise passes.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]
    npu_block_type = ps.npu_block_type
    split_offsets = [None, None]  # offset for [ifm, ifm2]

    ifm_idx = 0
    for op in ps.ops:
        if op.type == "SplitSliceRead":
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            ifm_idx += 1

    if len(ps.inputs) == 2 and npu_block_type == NpuBlockType.ElementWise:
        # Ensure correct ifm and ifm2 order
        if match_tensor(ps.inputs[0], ps.primary_op.inputs[1]) and match_tensor(
            ps.inputs[1], ps.primary_op.inputs[0]
        ):
            ps.ifm_tensor, ps.ifm2_tensor = ps.ifm2_tensor, ps.ifm_tensor
            split_offsets[0], split_offsets[1] = split_offsets[1], split_offsets[0]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full OFM extent; narrowed below by concat writes and by the per-stripe loops.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)

    concat_axis = 0
    concat_offset = 0

    # Fusable activation functions
    activation_ops = {"Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"}

    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    if strat == SchedulingStrategy.WeightStream:
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not weight_tensor.needs_dma():
            # Nothing to stream in per step, so cover the whole last axis in one stripe.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)

            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
                )
            else:
                ifm2_box = Box([], [])

            yield from _generate_intermediate_dmas(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0]
            )

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                if concat_axis - len(weight_tensor.shape) == -1:
                    # The concat axis lines up with the last weight axis, so the
                    # range is rebased by the concat offset.
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # The previous pass produces this pass's IFM; its commands are pulled lazily below.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Created once, on the first stripe that needs it, then reused for every later stripe.
        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            k_height = 1
            if npu_block_type == NpuBlockType.Pooling:
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            elif weight_tensor is not None:
                k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

            yield from _generate_intermediate_dmas(
                ps, ofm_box, strides, skirt, npu_block_type, concat_axis, concat_offset, split_offsets[0]
            )

            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                # Drain the previous pass only until it has produced enough rows;
                # the generator keeps its position for the next stripe.
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
        raise AssertionError("unknown scheduling strategy")
273
274
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield the high-level commands for a list of passes under strategy ``strat``.

    Args:
        strat: ``SchedulingStrategy.WeightStream`` or ``SchedulingStrategy.IfmStream``.
        passes: Passes to generate commands for.
        block_configs: Block configurations, indexed like ``passes``.

    Yields:
        The commands produced by ``generate_high_level_command_stream_for_pass``.

    Raises:
        AssertionError: If ``strat`` is neither of the two supported strategies.
    """
    if strat == SchedulingStrategy.WeightStream:
        # Each pass is generated on its own, in order.
        for idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx)
    elif strat == SchedulingStrategy.IfmStream:
        # Only the last pass is requested; it pulls in the earlier passes itself.
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, len(passes) - 1)
    else:
        # Raised explicitly (not via ``assert``) so the check survives ``python -O``.
        raise AssertionError("Unknown streaming strategy")
283
284
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield the high-level commands for every pass of the cascaded pass ``cps``.

    Uses ``cps.strategy`` as the scheduling strategy and each pass's own
    ``block_config`` as its block configuration.
    """
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
289
290
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream of subgraph ``sg`` and store it on the subgraph.

    Args:
        nng: Not used by this function; kept for call compatibility.
        sg: Subgraph whose ``cascaded_passes`` are processed. The resulting
            list is stored in ``sg.high_level_command_stream``.
        arch: Not used by this function; kept for call compatibility.
        verbose_high_level_command_stream: When true,
            ``sg.print_high_level_command_stream()`` is called afterwards.
    """
    res = []
    for cps in sg.cascaded_passes:
        # Only cascaded passes placed on the NPU contribute commands.
        if cps.placement == PassPlacement.Npu:
            res.extend(generate_high_level_command_stream_for_cascaded_pass(cps))

    sg.high_level_command_stream = res
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
300
301
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Return how much of the OFM may overlap the IFM for the given pass list.

    Args:
        strat: Scheduling strategy. ``SchedulingStrategy.WeightStream`` always
            gives 0.
        passes: Passes to analyse; the IFM is taken from the first pass and the
            OFM from the last.
        block_configs: Block configurations, indexed like ``passes``.

    Returns:
        A non-negative overlap amount, in the units returned by
        ``storage_size()`` / ``address_offset_for_coordinate()``. 0 is returned
        when either tensor is missing, for WeightStream, or when an address
        offset cannot be determined.
    """
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0
    if strat == SchedulingStrategy.WeightStream:
        return 0

    # storage_size is a method: it must be called, otherwise ifm_read would hold
    # the bound method and min(ifm_read, ofm_size) below would raise TypeError.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999
    ofm_size = passes[-1].ofm_tensor.storage_size()

    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # NOTE(review): unlike the two lookups above, a None result is not
                # handled here - confirm it cannot occur for an end coordinate.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
334
335
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Return the allowed OFM/IFM overlap for the cascaded pass ``cps``.

    Uses ``cps.strategy`` as the scheduling strategy and each pass's own
    ``block_config`` as its block configuration.
    """
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)