blob: 364df6f88e04dbdc7020d51daafb62732bd2076a [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16
17
# Description:
# Generate a high-level command stream from a scheduled subgraph with CascadedPasses.
#
# Also used during scheduling to work out the allowable IFM/OFM overlap; this functionality can be accessed using
# calc_allowed_ofm_ifm_overlap_for_cascaded_pass().
23
24from .nn_graph import SchedulingStrategy, PassPlacement
25import numpy as np
26from .operation import NpuBlockType
27from .high_level_command_stream import Box, CommandType, Command, NpuStripe, DMA
28
29
def need_dma(tens):
    """Return True when `tens` has exactly one entry in `tens.ops` and that op's type is "DMA"."""
    producers = tens.ops
    if len(producers) != 1:
        return False
    return producers[0].type == "DMA"
32
33
def dma_weights_if_necessary(ps, box, weight_tensor):
    """Yield a single DMA command for `weight_tensor` if need_dma() reports it is DMA-produced.

    The DMA command is built from the first input of the tensor's only op (the source),
    `weight_tensor` itself (the destination) and `box`. Nothing is yielded otherwise.
    `ps` is accepted but not used by this function.
    """
    if not need_dma(weight_tensor):
        return
    source_tensor = weight_tensor.ops[0].inputs[0]
    yield DMA(source_tensor, weight_tensor, box)
39
40
def generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx):
    """Generator yielding the high-level commands for the pass `passes[idx]`.

    Args:
        strat: The scheduling strategy; SchedulingStrategy.WeightStream or SchedulingStrategy.IfmStream.
        passes: Sequence of passes; only `passes[idx]` (and, for IfmStream, `passes[idx - 1]`) is read here.
        block_configs: Sequence parallel to `passes`; `block_configs[idx]` is used for this pass.
        idx: Index of the pass to generate commands for.

    Yields:
        NpuStripe commands, plus whatever dma_weights_if_necessary() yields for the weight tensor.
        With IfmStream, commands produced by the generator for the preceding pass (idx - 1) are
        interleaved ahead of each stripe that needs them.

    Side effects:
        Writes "fused_memory_function" / "fused_activation_function" into `ps.primary_op.attrs`
        when matching ops are found in `ps.ops`.

    Raises:
        AssertionError: if `strat` is neither of the two handled strategies.
    """
    is_first = idx == 0
    is_last = idx == len(passes) - 1
    ps = passes[idx]
    block_config = block_configs[idx]

    ifm_tensor = ps.ifm_tensor
    ifm2_tensor = ps.ifm2_tensor
    ofm_tensor = ps.ofm_tensor
    weight_tensor = ps.weight_tensor
    scale_tensor = ps.scale_tensor

    # Start with a box covering the whole OFM; individual axes are narrowed below.
    ofm_start = [0] * len(ofm_tensor.shape)
    ofm_end = list(ofm_tensor.shape)

    strides = None
    skirt = None
    if ps.primary_op is not None:
        strides = ps.primary_op.attrs.get("strides", None)
        skirt = ps.primary_op.attrs.get("skirt", None)

    npu_block_type = ps.npu_block_type

    concat_axis = 0
    concat_offset = 0

    split_offsets = [None, None]  # offset for [ifm, ifm2]

    # Fusable activation functions
    activation_ops = set(("Sigmoid", "Tanh", "Relu", "Relu6", "ReluN1To1"))

    for op in ps.ops:
        if op.type == "ConcatSliceWrite":
            concat_axis = op.attrs["concat_axis"]
            concat_start = op.attrs["concat_start"]
            concat_end = op.attrs["concat_end"]

            # Restrict the OFM box to the slice written along the concat axis.
            ofm_start[concat_axis] = concat_start
            ofm_end[concat_axis] = concat_end
            concat_offset = concat_start
            # NOTE(review): primary_op is dereferenced here without the None check used above - confirm
            # a pass containing these ops always has a primary_op.
            ps.primary_op.attrs["fused_memory_function"] = op.type
        elif op.type in activation_ops:
            ps.primary_op.attrs["fused_activation_function"] = op.type

    # The ops list has to be reversed here since the Pass Packing is done in reverse
    ifm_idx = 0
    for op in reversed(ps.ops):
        if op.type == "SplitSliceRead":
            # The n-th SplitSliceRead found (in reversed order) fills slot n of [ifm, ifm2].
            split_offsets[ifm_idx] = op.attrs["split_start"]
            ps.primary_op.attrs["fused_memory_function"] = op.type
            ifm_idx += 1

    if strat == SchedulingStrategy.WeightStream:
        # Step along the last OFM axis in units of the last block_config entry.
        ofm_step = block_config[-1]
        ofm_stop = ofm_end[-1]
        if weight_tensor is None or not need_dma(weight_tensor):
            # No weight DMA involved: cover the whole last axis in a single step.
            ofm_step = ofm_stop
        for start in range(ofm_start[-1], ofm_stop, ofm_step):
            end = min(start + ofm_step, ofm_stop)
            ofm_start[-1] = start
            ofm_end[-1] = end
            ofm_box = Box(ofm_start, ofm_end)
            ifm_box = None
            ifm2_box = None

            # Tensors with an empty shape get an empty box instead of a transformed one.
            if ifm_tensor.shape != []:
                ifm_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0]
                )
            else:
                ifm_box = Box([], [])
            if ifm2_tensor is not None and ifm2_tensor.shape != []:
                ifm2_box, _, _ = ofm_box.transform_with_strides_and_skirt(
                    strides, skirt, ifm2_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[1]
                )
            else:
                ifm2_box = Box([], [])

            weight_box = None
            if weight_tensor is not None:
                weight_oc_start = start
                weight_oc_end = end
                # When the concat axis is the last axis relative to the weight tensor's rank, the
                # start/end values include the concat offset; remove it to index into the weights.
                if concat_axis - len(weight_tensor.shape) == -1:
                    weight_oc_start -= concat_offset
                    weight_oc_end -= concat_offset

                weight_box = Box.make_weight_box(
                    weight_tensor.shape,
                    npu_block_type,
                    weight_oc_start,
                    weight_oc_end,
                    weight_tensor.weight_transpose_depthwise,
                )
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            # The two literal True arguments occupy the positions that receive
            # is_first_h_stripe / is_last_h_stripe in the IfmStream branch below.
            yield NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                True,
                True,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                ifm2_tensor=ifm2_tensor,
                ifm2_box=ifm2_box,
            )

    elif strat == SchedulingStrategy.IfmStream:
        # Step along the third-from-last OFM axis (named y here) in units of block_config[0].
        y_step = block_config[0]
        y_start = 0
        y_dim = 1
        if len(ofm_tensor.shape) >= 3:
            y_start = ofm_start[-3]
            y_dim = ofm_end[-3]
        if idx > 0:
            # Nothing of this pass's IFM has been produced yet; the preceding pass's generator is
            # created once and advanced lazily inside the stripe loop below.
            ifm_y_present = 0
            prev_pass = passes[idx - 1]
            prev_pass_gen = generate_high_level_command_stream_for_pass(strat, passes, block_configs, idx - 1)
        else:
            # First pass: treat the full IFM extent on the y axis as already present.
            ifm_y_present = 1
            if len(ifm_tensor.shape) >= 3:
                ifm_y_present = ifm_tensor.shape[-3]
            prev_pass_gen = []
            prev_pass = None

        if len(passes) == 1:
            # no cascading, can just issue one big stripe
            # but only if we've done allocation and OFM does not overlap IFM
            if ifm_tensor.address != -1 and ofm_tensor.address != -1:
                if (
                    ifm_tensor.address + ifm_tensor.storage_size() <= ofm_tensor.address
                    or ofm_tensor.address + ofm_tensor.storage_size() <= ifm_tensor.address
                ):
                    y_step = y_dim

        # Created on the first stripe that needs it and then reused for every later stripe.
        weight_box = None

        for start in range(y_start, y_dim, y_step):
            end = min(start + y_step, y_dim)
            if len(ofm_tensor.shape) >= 3:
                ofm_start[-3] = start
                ofm_end[-3] = end
            ofm_box = Box(ofm_start, ofm_end)

            # Kernel height: from the "ksize" attribute for pooling, otherwise from the first
            # dimension of the weight tensor; defaults to 1 when neither is available.
            k_height = 1
            if npu_block_type == NpuBlockType.Pooling:
                if ps.primary_op is not None:
                    k_height = ps.primary_op.attrs["ksize"][1]
            else:
                if weight_tensor is not None:
                    k_height = weight_tensor.shape[0]

            ifm_box, pad_top, pad_bottom = ofm_box.transform_with_strides_and_skirt(
                strides, skirt, ifm_tensor.shape, npu_block_type, concat_axis, concat_offset, split_offsets[0], k_height
            )

            ifm_y_needed = 1
            if len(ifm_box.end_coord) >= 3:
                ifm_y_needed = ifm_box.end_coord[-3]
            if ifm_y_present < ifm_y_needed:
                # Pull commands from the preceding pass until it has produced enough rows. Because
                # prev_pass_gen is a single generator object, the break leaves it paused and the
                # next stripe resumes from where this one stopped.
                for prev_cmd in prev_pass_gen:
                    yield prev_cmd
                    # presumably rng is the (start, end) OFM y-range prev_cmd produces for prev_pass,
                    # or None if not applicable - TODO confirm against get_ofm_y_range_for_pass
                    rng = prev_cmd.get_ofm_y_range_for_pass(prev_pass)
                    if rng is not None:
                        ifm_y_present = max(ifm_y_present, rng[1])
                        if ifm_y_present >= ifm_y_needed:
                            break

            if weight_tensor is not None and weight_box is None:
                weight_box = Box.make_weight_box(
                    weight_tensor.shape, npu_block_type, weights_transposed=weight_tensor.weight_transpose_depthwise
                )
                yield from dma_weights_if_necessary(ps, weight_box, weight_tensor)

            # Check if first/last stripe in pass
            is_first_h_stripe = start == y_start
            is_last_h_stripe = (start + y_step) >= y_dim

            # The two positional None arguments are passed where the WeightStream branch passes
            # ifm2_tensor / ifm2_box by keyword (presumably the same parameters - TODO confirm
            # against the NpuStripe signature).
            stripe = NpuStripe(
                ps,
                block_config,
                is_first,
                is_last,
                is_first_h_stripe,
                is_last_h_stripe,
                ifm_tensor,
                ifm_box,
                ofm_tensor,
                ofm_box,
                weight_tensor,
                weight_box,
                scale_tensor,
                concat_axis,
                concat_offset,
                None,
                None,
                pad_top,
                pad_bottom,
            )
            yield stripe
    else:
        assert 0, "unknown scheduling strategy"
251
252
def generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
    """Generator yielding the high-level commands for a whole list of passes.

    With WeightStream every pass is generated in turn, in list order. With IfmStream only
    the last pass is requested directly; the per-pass generator is handed the full pass list
    and index and obtains the earlier passes' commands itself.

    Raises:
        AssertionError: if `strat` is neither WeightStream nor IfmStream.
    """
    if strat == SchedulingStrategy.WeightStream:
        pass_count = len(passes)
        for pass_idx in range(pass_count):
            yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, pass_idx)
        return

    if strat == SchedulingStrategy.IfmStream:
        final_idx = len(passes) - 1
        yield from generate_high_level_command_stream_for_pass(strat, passes, block_configs, final_idx)
        return

    assert 0, "Unknown streaming strategy"
261
262
def generate_high_level_command_stream_for_cascaded_pass(cps):
    """Generator yielding the high-level commands for the cascaded pass `cps`.

    Uses the cascaded pass's strategy and passes, with each pass's own `block_config`.
    """
    member_passes = cps.passes
    per_pass_block_configs = [member.block_config for member in member_passes]
    yield from generate_high_level_command_stream_for_pass_list(cps.strategy, member_passes, per_pass_block_configs)
267
268
def generate_high_level_command_stream(nng, sg, arch, verbose_high_level_command_stream):
    """Build the high-level command stream for subgraph `sg` and store it on the subgraph.

    Commands are collected, in order, from every cascaded pass whose placement is
    PassPlacement.Npu and assigned to `sg.high_level_command_stream`. When
    `verbose_high_level_command_stream` is truthy the stream is printed through
    `sg.print_high_level_command_stream()`. `nng` and `arch` are accepted but not used here.
    """
    commands = []
    for cascaded in sg.cascaded_passes:
        if cascaded.placement != PassPlacement.Npu:
            continue
        commands.extend(generate_high_level_command_stream_for_cascaded_pass(cascaded))

    sg.high_level_command_stream = commands

    if verbose_high_level_command_stream:
        sg.print_high_level_command_stream()
278
279
def calc_allowed_ofm_ifm_overlap_for_pass_list(strat, passes, block_configs):
    """Work out the allowed OFM/IFM overlap for a list of passes.

    The high-level command stream for the passes is generated and walked. For every NPU pass
    command flagged `is_first` and/or `is_last`, the highest OFM write offset seen so far is
    compared with the current IFM read offset, and the smallest resulting allowance is kept.

    Args:
        strat: The scheduling strategy. WeightStream always gives 0.
        passes: Sequence of passes; the IFM of the first and the OFM of the last are examined.
        block_configs: Block configs parallel to `passes`, forwarded to the command generator.

    Returns:
        The allowed overlap, never negative. 0 is returned when either tensor is missing,
        for the WeightStream strategy, or when an address offset cannot be determined.
    """
    if not passes[0].ifm_tensor or not passes[-1].ofm_tensor:
        return 0
    # WeightStream always yields 0, so return before computing sizes or generating commands.
    if strat == SchedulingStrategy.WeightStream:
        return 0

    highest_ofm_write = 0
    # storage_size is a method (it is called for the OFM below); it must be called here as well.
    # Previously the bound method itself was stored, which would make min(ifm_read, ofm_size)
    # raise TypeError if an is_last command was seen before any is_first command.
    ifm_read = passes[0].ifm_tensor.storage_size()
    min_overlap = 999999999999999999999
    ofm_size = passes[-1].ofm_tensor.storage_size()

    for cmd in generate_high_level_command_stream_for_pass_list(strat, passes, block_configs):
        if not cmd.is_npu_pass_command():
            continue

        if cmd.is_first:
            # Offset of the start of the IFM box this command reads.
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.start_coord, is_top_box=False)
            if ifm_read is None:
                return 0
        if cmd.is_last:
            # Offset of the end of the OFM box this command writes.
            write_offset = cmd.ofm_tensor.address_offset_for_coordinate(cmd.ofm_box.end_coord, is_top_box=True)
            if write_offset is None:
                return 0
            highest_ofm_write = max(write_offset, highest_ofm_write)

        if cmd.is_first or cmd.is_last:
            overlap_required = max(highest_ofm_write - min(ifm_read, ofm_size), 0)
            can_overwrite = ofm_size - overlap_required
            min_overlap = min(min_overlap, can_overwrite)

        if cmd.is_first:
            # Advance the read position to the end of the IFM box just consumed.
            # NOTE(review): unlike the start-coordinate lookup above, this result is not checked
            # for None - confirm it cannot be None for an end coordinate.
            ifm_read = cmd.ifm_tensor.address_offset_for_coordinate(cmd.ifm_box.end_coord, is_top_box=True)

    return max(min_overlap, 0)
312
313
def calc_allowed_ofm_ifm_overlap_for_cascaded_pass(cps):
    """Return the allowed OFM/IFM overlap for the cascaded pass `cps`.

    Delegates to calc_allowed_ofm_ifm_overlap_for_pass_list() with the cascaded pass's
    strategy, its passes, and each pass's own `block_config`.
    """
    member_passes = cps.passes
    per_pass_block_configs = [member.block_config for member in member_passes]
    return calc_allowed_ofm_ifm_overlap_for_pass_list(cps.strategy, member_passes, per_pass_block_configs)