# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# Also used during scheduling to work out allowable IFM/OFM overlap, this functionality can be accessed using
# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
from .high_level_command_stream import Box
from .high_level_command_stream import DMA
from .high_level_command_stream import NpuStripe
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .operation import NpuBlockType
Tim Hall79d07d22020-04-27 18:20:16 +010027
28
def need_dma(tens):
    """Return True when *tens* is produced by exactly one op and that op is a DMA."""
    producers = tens.ops
    if len(producers) != 1:
        return False
    return producers[0].type == "DMA"
31
32
def dma_weights_if_necessary(ps, box, weight_tensor):
    """Yield a DMA command bringing *weight_tensor* in, when its producer is a DMA op.

    Yields nothing when the weights are already resident (no DMA producer).
    """
    if not need_dma(weight_tensor):
        return
    src_tensor = weight_tensor.ops[0].inputs[0]
    yield DMA(src_tensor, weight_tensor, box)
38
39
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Yield high-level commands (DMA / NpuStripe) for passes[idx].

    strat:         SchedulingStrategy selecting how the pass is striped
    passes:        list of passes forming a cascade
    block_configs: per-pass block configurations, parallel to *passes*
    idx:           index of the pass to generate commands for

    Under IfmStream scheduling this recurses into the previous pass
    (idx - 1) and interleaves its commands, so that the IFM rows required
    by each stripe of this pass are produced before the stripe is issued.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Full OFM extent; narrowed below per concat slice and per stripe.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)

    npu_block_type = ps.npu_block_type

    concat_axis = 0
    concat_offset = 0

    split_offsets = [None, None]  # offset for [ifm, ifm2]

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    # Fold concat writes and activations into the primary op's attributes.
    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            # Restrict the OFM window to this concat slice.
            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    # The ops list has to be reversed here since the Pass Packing is done in reverse
    ifm_idx = 0
    for op in reversed(ps.ops):
        if op.type == "SplitSliceRead":
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            ifm_idx += 1

    if strat == SchedulingStrategy.WeightStream:
        # Stripe along the innermost (depth/channel) axis of the OFM.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        # Without DMA'd weights there is nothing to stream: one full stripe.
        if weight_tensor is None or not need_dma(weight_tensor):
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # Map the OFM window back to the IFM(s); scalar inputs get an empty box.
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
                )
            else:
                ifm2_box = Box([], [])

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When concatenating along the weights' output-channel axis,
                # the weight window is relative to this slice, not the concat.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Stripe along the height axis (index -3 of an NHWC-style shape).
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Earlier pass exists: its commands are pulled in lazily as the
            # IFM rows they produce become needed.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: the whole IFM is already available.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Weight box is built once, on the first stripe that needs it.
        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height widens the IFM rows each OFM stripe depends on.
            k_height = 1
            if npu_block_type == NpuBlockType.Pooling:
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

            # Drain commands from the previous pass until it has produced
            # enough IFM rows for this stripe.
            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
250
251
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Yield high-level commands for a whole pass list under the given strategy."""
    last_idx = len(passes) - 1
    if strat == SchedulingStrategy.WeightStream:
        # Weight streaming: every pass is emitted independently, in order.
        for pass_idx in range(len(passes)):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
    elif strat == SchedulingStrategy.IfmStream:
        # IFM streaming: start at the last pass, which recursively pulls in
        # its predecessors' commands as their output rows are needed.
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, last_idx)
    else:
        assert 0, "Unknown streaming strategy"
260
261
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Yield the high-level commands for one cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, cps.passes, block_configs)
266
267
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph *sg* and store it on sg.

    Only cascaded passes placed on the NPU contribute commands; optionally
    prints the resulting stream when verbose output is requested.
    """
    commands = []
    for cps in sg.cascaded_passes:
        if cps.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cps))

    sg.high_level_command_stream = commands
    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
277
278
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Calculate how many bytes the last pass's OFM may overlap the first pass's IFM.

    Simulates the command stream and tracks, at each first/last-pass command,
    the lowest IFM byte still to be read versus the highest OFM byte written.
    Returns the minimum safe overlap in bytes (0 when no overlap is possible
    or it cannot be determined, e.g. addresses not yet assigned).
    """
    highest_ofm_write = 0
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0

    # Bug fix: storage_size is a method — the original assigned the bound
    # method object itself, which would make min(ifm_read, ofm_size) below
    # raise TypeError if this initial value were ever used.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999  # effectively +infinity
    ofm_size = passes[-1].ofm_tensor.storage_size()
    # Weight streaming writes full-depth stripes; no overlap is allowed.
    if strat == SchedulingStrategy.WeightStream:
        return 0
    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if cmd.is_npu_pass_command():
            if cmd.is_first:
                # Lowest IFM address this stripe starts reading from.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
                if ifm_read is None:
                    return 0
            if cmd.is_last:
                # Highest OFM address written so far.
                write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
                if write_offset is None:
                    return 0
                highest_ofm_write = max(write_offset, highest_ofm_write)

            if cmd.is_first or cmd.is_last:
                # Overlap may consume only OFM bytes beyond what is still unread.
                overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
                can_overwrite = ofm_size - overlap_required
                min_overlap = min(min_overlap, can_overwrite)

            if cmd.is_first:
                # After the stripe executes, the IFM is consumed up to its end.
                ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    min_overlap = max(min_overlap, 0)
    return min_overlap
311
312
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Return the allowed IFM/OFM storage overlap (bytes) for a cascaded pass."""
    block_configs = [ps.block_config for ps in cps.passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, cps.passes, block_configs)