# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Packs a subgraph with Neural Network Operations into Passes. Each Pass has one or more Operations.
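# Packing proceeds backwards from each subgraph's output tensors, greedily
# fusing producer operations into the current Pass for as long as the rule
# table (test_sequence, below) allows it.
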
import collections
import enum

from .nn_graph import Pass, PassPlacement
from .operation import Operation, NpuBlockType
from .tensor import TensorPurpose


class PassFlags(enum.Flag):
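    """Role flags accumulated while packing operations into a pass.

    The values are distinct bits so that combinations can be stored in a
    single enum.Flag value and tested with bitwise operators; some bit
    positions are left unused.
    """
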
    Empty = 0
    Pre = 1
    Main = 2
    Post = 4
    Mac = 8
    Dma = 32
    ElementWise = 256
    Npu = 512
    Cpu = 1024
    StartupInit = 2048
    MemoryOnly = 4096
    PostFusingLimited = 8192


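# Operator sets driving the packing rules below. The strings are operator
# type names as they appear in the internal graph representation.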
npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead",))

mac_main_ops = set(
    (
        # convolutions
        "Conv2DBiasAct",
        "Conv2D",
        "QuantizedConv2D",
        "Conv2DBackpropInputSwitched",
        # depth-wise convolutions
        "DepthwiseConv2dBiasAct",
        "DepthwiseConv2dNative",
        "QuantizedDepthwiseConv2D",
        # FC layers
        "QuantizedMatMul",
        "MatMul",
        "FullyConnectedAct",
        # RNN/LSTM/GRU
        "BlockLSTM",
        # pooling
        "QuantizedMaxPool",
        "QuantizedAvgPool",
        "AvgPool",
        "MaxPool",
        "AvgPoolAct",
        "MaxPoolAct",
        # resizing/upscaling
        "ResizeBilinear",
    )
)

binary_elem_wise_main_ops = set(
    (
        # binary element-wise
        "AddAct",
        "MulAct",
        "SubAct",
        "QuantizedAdd",
        "QuantizedSub",
        "QuantizedMul",
        "Mul",
        "Add",
        "Sub",
        "Minimum",
        "Maximum",
    )
)

unary_elem_wise_main_ops = set(("LeakyRelu", "Abs"))  # Unary element-wise operations

elem_wise_main_ops = binary_elem_wise_main_ops | unary_elem_wise_main_ops

activation_ops = set(("QuantizedRelu", "QuantizedRelu1", "QuantizedRelu6", "Relu", "Relu6", "ReluN1To1"))
npu_post_ops = activation_ops | set(
    # Bias-add operations: get rid of these once we have rewrites from Conv2D + BiasAdd + Activation to Conv2DBiasAct.
    ("Mul", "Add", "QuantizedBiasAdd", "Requantize", "QuantizedBatchNorm", "BiasAdd", "FusedBatchNorm")
)

npu_post_fuse_limited_ops = set(
    # Set of post operators that should not be fused with main/element-wise ops
    ("ConcatSliceWrite", "Sigmoid", "Tanh")
)

elem_wise_ops = elem_wise_main_ops | activation_ops | set(("Sigmoid", "Tanh"))


quantization_ops = set(("Dequantize", "QuantizeV2", "Max", "Min"))
cpu_ops = set(("Softmax", "QuantizedSoftmax", "LRN", "Shape", "QuantizedPad", "Pad", "AddN")) | quantization_ops

npu_dma_ops = set(("DMA",))
startup_init_ops = set(("Const", "VariableV2", "Placeholder", "SubgraphInput"))
memory_only_ops = set(("Squeeze", "Reshape", "QuantizedReshape", "ExpandDims",))


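# Each entry of test_sequence is a tuple of
# (ops_set, incompatible_pack_flags, flags_to_set, flags_to_clear).
# During packing, entries are tried in order: an operation is packed by the
# first entry whose ops_set contains its type (None matches any type) and
# whose incompatible_pack_flags do not overlap the flags of the pass built so
# far; the matched entry's flags are then cleared/set on the pass.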
test_sequence = [
    (
        # ops_set
        npu_post_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.Post,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_post_fuse_limited_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly | PassFlags.Pre | PassFlags.Main,
        # flags_to_set
        PassFlags.Npu | PassFlags.PostFusingLimited,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        mac_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.ElementWise
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        elem_wise_main_ops,
        # incompatible_pack_flags
        PassFlags.Cpu
        | PassFlags.MemoryOnly
        | PassFlags.Mac
        | PassFlags.Pre
        | PassFlags.Main
        | PassFlags.PostFusingLimited,
        # flags_to_set
        PassFlags.Npu | PassFlags.ElementWise | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_pre_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Mac | PassFlags.Pre | PassFlags.ElementWise,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        npu_dma_ops,
        # incompatible_pack_flags
        PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.Npu | PassFlags.Dma,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        startup_init_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu | PassFlags.MemoryOnly,
        # flags_to_set
        PassFlags.StartupInit | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        memory_only_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.Cpu,
        # flags_to_set
        PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (
        # ops_set
        cpu_ops,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
    (  # This last one is a fallback for unrecognised operations
        # ops_set
        None,
        # incompatible_pack_flags
        PassFlags.Npu | PassFlags.MemoryOnly | PassFlags.Main,
        # flags_to_set
        PassFlags.Cpu | PassFlags.Main,
        # flags_to_clear
        PassFlags.Empty,
    ),
]

# Some sanity checking
for (operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear) in test_sequence:
    assert not flags_to_clear & flags_to_set

    if operation_set is not None:
        for op in operation_set:
            assert len(op) > 1  # Catch a bare string used in place of a tuple, which would decompose into characters


def pack_into_passes(nng, arch, verbose_packing=False):
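    """Pack each subgraph in nng into Passes of one or more Operations.

    Traversal starts from the subgraph output tensors and works backwards,
    building passes according to the rules in test_sequence. Fills in
    sg.passes for every subgraph and returns nng.
    """
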
    def visit_op(op, ignored):
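        """Visit op once per consumed output tensor; build a pass for it (or
        queue it for the startup pass) once all outputs have been visited."""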
        visit_op_refcount[op] += 1

        if visit_op_refcount[op] == 1:  # First-time visit, go and fix up unused output tensors
            for tens in op.outputs:
                if len(tens.consumers()) == 0:
                    visit_op_refcount[op] += 1

        assert visit_op_refcount[op] <= len(op.outputs)
        if visit_op_refcount[op] == len(op.outputs):

            if op.type in startup_init_ops:
                startup_list.append(op)
            else:
                _, _, _, ofm_tensor = op.get_ifm_ifm2_weights_ofm()
                if ofm_tensor is None:
                    ofm_tensor = op.outputs[0]
                build_pass((op,), ofm_tensor)

    def build_pass(start_ops_to_process, ofm_tensor=None):
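        """Assemble a single Pass by walking backwards from
        start_ops_to_process, packing producer ops into the pass while the
        rules in test_sequence allow it. Returns the new Pass.
        """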
        reverse_ops_list = []
        curr_flags = PassFlags.Empty
        npu_block_type = NpuBlockType.Default

        reverse_intermediates = []
        input_set = set()
        ifm_tensor = None
        primary_op = None

        to_process = collections.deque()
        for start_op in start_ops_to_process:
            to_process.append((start_op, None))

        while to_process:
            curr_op, tens = to_process.popleft()

            if curr_op in reverse_ops_list:
                continue

            for operation_set, incompatible_pack_flags, flags_to_set, flags_to_clear in test_sequence:
                if operation_set is None or curr_op.type in operation_set:
                    if not (curr_flags & incompatible_pack_flags):
                        if flags_to_set & PassFlags.Npu:
                            if not curr_op.run_on_npu:
                                continue

                        reverse_ops_list.append(curr_op)
                        new_block_type = curr_op.attrs.get("npu_block_type", NpuBlockType.Default)
                        if new_block_type != NpuBlockType.Default:
                            assert npu_block_type == NpuBlockType.Default
                            npu_block_type = new_block_type  # Only one major block type per pass
                            assert primary_op is None
                            primary_op = curr_op

                        curr_flags &= ~flags_to_clear
                        curr_flags |= flags_to_set

                        if flags_to_set & PassFlags.Npu:
                            if flags_to_set & (
                                PassFlags.Mac | PassFlags.ElementWise | PassFlags.Post | PassFlags.PostFusingLimited
                            ):
                                assert len(curr_op.inputs) >= 1
                                if curr_op.type == "BlockLSTM":
                                    ifm_tensor = curr_op.inputs[3]
                                else:
                                    ifm_tensor = curr_op.inputs[0]
                                assert ifm_tensor.purpose == TensorPurpose.FeatureMap

                        if flags_to_set & PassFlags.Dma:
                            # DMAs are special - output buffers need to be preserved as
                            # intermediates if the pass consumes the results
                            if tens is not None:
                                reverse_intermediates.append(tens)

                        if operation_set is None:
                            print("Warning:", curr_op.type, "operation is unknown or unsupported, placing on CPU")

                        for inp in curr_op.inputs:
                            can_pack = True
                            if len(inp.ops) == 1:
                                next_op = inp.ops[0]
                                for outp in next_op.outputs:
                                    consumers = outp.consumers()
                                    if len(consumers) > 1 or (len(consumers) == 1 and consumers[0] != curr_op):
                                        can_pack = False
                                        break
                            else:
                                can_pack = False

                            if can_pack:
                                to_process.append((next_op, inp))
                            else:
                                assert inp is not None
                                input_set.add(inp)

                        break

            else:
                # No rule matched (the for loop finished without a break): this
                # operation is not compatible with the already packed operations,
                # so just register the tensor as an input
                assert tens is not None
                input_set.add(tens)

            if curr_flags & PassFlags.Npu and not curr_flags & (PassFlags.ElementWise | PassFlags.Mac):
                # Make the choice that if we don't have a MAC operation, the ambidextrous operations go on the
                # element-wise unit
                curr_flags |= PassFlags.ElementWise

        is_element_wise = True
        for op in reverse_ops_list:
            if op.type not in elem_wise_ops and op.type not in npu_dma_ops:
                is_element_wise = False
                break

        placement = PassPlacement.Unknown
        if curr_flags & PassFlags.Npu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Npu
        if curr_flags & PassFlags.Cpu:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.Cpu
        if curr_flags & PassFlags.MemoryOnly:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.MemoryOnly
        if curr_flags & PassFlags.StartupInit:
            assert placement == PassPlacement.Unknown
            placement = PassPlacement.StartupInit
        assert placement != PassPlacement.Unknown

        ops_list = list(reversed(reverse_ops_list))
        intermediates = list(reversed(reverse_intermediates))

        if primary_op is None:
            primary_op = create_primary_op(ops_list)
            if primary_op is not None:
                # Attaching the synthesised op leaves an extra entry in the input
                # tensor's consumer list, so bump its visit refcount to match
                visit_tensor_refcount[primary_op.inputs[0]] += 1
                npu_block_type = primary_op.attrs["npu_block_type"]
                for input_tens in primary_op.inputs:
                    if input_tens not in input_set:
                        input_set.add(input_tens)

        ordered_input_list = []
        input_refcounts = collections.defaultdict(int)
        for op in ops_list:
            for inp in op.inputs:
                if inp in input_set:
                    if input_refcounts[inp] == 0:
                        ordered_input_list.append(inp)
                    input_refcounts[inp] += 1

        name = ops_list[0].name
        non_dma_ops = [op for op in ops_list if op.type != "DMA"]
        if non_dma_ops:
            name = non_dma_ops[0].name
        ps = Pass(name, placement, is_element_wise, npu_block_type)
        ps.ops = ops_list
        ps.primary_op = primary_op
        ps.inputs = ordered_input_list
        ps.intermediates = intermediates
        ps.outputs = list(ops_list[-1].outputs)
        ps.ifm_tensor = ifm_tensor

        # ElementWise operation, 2 IFMs
        if ps.primary_op and ps.primary_op.type in binary_elem_wise_main_ops:
            ps.ifm_tensor = ps.inputs[0]

            if len(ps.inputs) == 1:
                # Only 1 input, IFM and IFM2 are the same tensor
                ps.ifm2_tensor = ps.inputs[0]
            else:
                ps.ifm2_tensor = ps.inputs[1]
        else:
            ps.ifm_tensor = ifm_tensor
            ps.ifm2_tensor = None

        ps.ofm_tensor = ofm_tensor
        assert ps.placement != PassPlacement.Npu or ps.ofm_tensor is not None
        ps.weight_tensor = ps.get_primary_op_ifm_weights()[1]
        ps.scale_tensor = ps.get_primary_op_ifm_weights_biases_ofm()[2]

        for op in ps.ops:
            op.scheduled_pass = ps

        reverse_pass_list.append(ps)

        # Continue the traversal through each of the pass's inputs, once per use
        for inp, refcount in input_refcounts.items():
            for _ in range(refcount):
                visit_tensor(inp)

        return ps

    def visit_tensor(tens):
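        """Mark one more consumer of tens as visited; once all consumers are
        accounted for, recurse into the ops that produce it."""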
        visit_tensor_refcount[tens] += 1
        assert visit_tensor_refcount[tens] <= len(tens.consumers())
        if visit_tensor_refcount[tens] == len(tens.consumers()):
            for op in reversed(tens.ops):
                visit_op(op, tens)

    def create_primary_op(ops_list):
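        """Called when a pass has no main operation: if it contains NPU
        pre/post ops, synthesise a 1x1 unit-stride AvgPool (a pass-through on
        the feature map) to act as the primary operation. Returns the new op,
        or None if no primary op is needed.
        """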
        if any(op.type in (npu_pre_ops | npu_post_ops | npu_post_fuse_limited_ops) for op in ops_list):
            # Configure a 1x1 AvgPool and attach the op onto it
            op = ops_list[0]
            inp = op.inputs[0]
            avgpool_name = op.name + "_avgpool"
            avgpool_op = Operation("AvgPool", avgpool_name)
            avgpool_op.inputs = [inp]
            avgpool_op.inputs[0].consumer_list.append(avgpool_op)
            avgpool_op.attrs["padding"] = b"VALID"
            avgpool_op.attrs["npu_block_type"] = NpuBlockType.Pooling
            avgpool_op.attrs["stride_w"] = 1
            avgpool_op.attrs["stride_h"] = 1
            avgpool_op.attrs["filter_width"] = 1
            avgpool_op.attrs["filter_height"] = 1
            avgpool_op.attrs["strides"] = [1, 1, 1, 1]
            avgpool_op.attrs["ksize"] = [1, 1, 1, 1]
            avgpool_op.attrs["skirt"] = [0, 0, 0, 0]
            avgpool_op.attrs["explicit_padding"] = [0, 0, 0, 0]
            avgpool_out = inp.clone("_avgpooled")
            avgpool_out.consumer_list.append(op)
            avgpool_out.ops = [avgpool_op]
            avgpool_op.outputs = [avgpool_out]

            op.inputs[0] = avgpool_out
            ops_list.insert(0, avgpool_op)

            return avgpool_op

        return None

    for sg in nng.subgraphs:
        reverse_pass_list = []
        visit_op_refcount = collections.defaultdict(int)
        visit_tensor_refcount = collections.defaultdict(int)

        startup_list = []

        for tens in sg.output_tensors:
            visit_tensor(tens)

        if startup_list:
            startup_ps = build_pass(startup_list)
            startup_ps.outputs = [op.outputs[0] for op in startup_list]  # Need to fix up the outputs
            startup_ps.name = "startup_weight_initialisation"

        sg.passes = list(reversed(reverse_pass_list))
        sg.build_pass_links()

    if verbose_packing:
        nng.print_passes()

    return nng