# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
from .tensor import MemArea


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions:
    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
    """

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        show_minimum_possible_allocation=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.show_minimum_possible_allocation = show_minimum_possible_allocation
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


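# Illustrative sketch (hypothetical names; nothing in this module calls this):
# a frontend builds the graph and options, then hands both to the driver:
#
#     options = CompilerOptions(verbose_graph=True, timing=True)
#     compiler_driver(nng, arch, options, scheduler_options)
#     # nng is modified in place by the stages that follow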
def compiler_driver(nng, arch, options, scheduler_options):
    assert verify_graph_health(nng)
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
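    # Group operators into passes - the units that the scheduler and the
    # allocators below operate on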
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

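    # Pull the Npu-placed passes out into their own subgraphs; Cpu passes stay
    # in the root subgraph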
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
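    # (among other things, this fixes the block config that the weight
    # compression step below depends on)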
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calculate and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

    # Memory area for all non-constant tensors (Cpu and Npu)
    non_const_mem_area = MemArea.Sram

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
            )

    assert len(nng.subgraphs) > 1, "Error: No operators can be hardware accelerated; cancelling compilation"

    # Allocate all Npu constant tensors to the first Npu subgraph since it is
    # processed first during serialisation into tensors
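    # (index 0 is the root Cpu subgraph; the assert above guarantees at least
    # one Npu subgraph follows it)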
    first_npu_sg = nng.subgraphs[1]
    assert first_npu_sg.placement == PassPlacement.Npu
    tensor_allocation.allocate_tensors(
        nng,
        first_npu_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
        lr_graph_flash,
    )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph. Finally, all of the
    # non-constant tensors are allocated together
    root_sg = nng.get_root_subgraph()
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        non_const_mem_area,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        register_command_stream_generator.generate_register_command_stream(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, flash_tens
        )

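    # Point the Cpu graph's Npu call operations at the command stream, scratch
    # and flash tensors serialised above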
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

    npu_performance.calc_performance_for_network(nng, arch)