# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Contains the main sequencing of the compiler.

import time

from . import graph_optimiser
from . import mark_tensors
from . import insert_dma
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from . import npu_performance

from . import high_level_command_stream
from . import high_level_command_stream_generator
from . import register_command_stream_generator
from . import extract_npu_subgraphs
from . import npu_serialisation
from . import weight_compressor
from . import live_range
from .tensor import MemArea
from .nn_graph import TensorAllocator, PassPlacement
from .rewrite_graph import verify_graph_health, verify_subgraph_health


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions
    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
    """

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        show_minimum_possible_allocation=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.show_minimum_possible_allocation = show_minimum_possible_allocation
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


def compiler_driver(nng, arch, options, scheduler_options):
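    """Sequence the compilation of the network graph (nng) for the given architecture and options.

    The steps below are: graph optimisation, tensor and pass preparation, Npu
    subgraph extraction, scheduling, weight compression and scale/bias packing,
    tensor allocation, command stream generation, serialisation of the Npu
    subgraphs into tensors and, finally, performance estimation.
    """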
    assert verify_graph_health(nng)
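    # First stage of graph optimisation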
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()
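
    # Second stage of graph optimisation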
    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
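
    # Mark the purpose of each tensor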
    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
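    # Insert DMA commands for data that needs to be moved between memory areas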
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
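    # Pack the operations into passes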
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)
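
    # Extract the Npu-placed passes into their own Npu subgraphs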
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
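
    # Mark the storage format of each tensor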
    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the block
    # config, and calculate and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

    # Memory area for all non-constant tensors (Cpu and Npu)
    non_const_mem_area = MemArea.Sram

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg, permanent_storage, ignore_subgraph_input_output_tensors=True, lr_graph=lr_graph_flash
            )

    # Allocate all Npu constant tensors to the first Npu subgraph, since it is
    # processed first during serialisation into tensors
    first_npu_sg = nng.subgraphs[1]
    assert first_npu_sg.placement == PassPlacement.Npu
    tensor_allocation.allocate_tensors(
        nng,
        first_npu_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
        lr_graph_flash,
    )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph. Finally, all of the
    # non-constant tensors are allocated together
    root_sg = nng.get_root_subgraph()
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        non_const_mem_area,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        register_command_stream_generator.generate_register_command_stream(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, flash_tens
        )
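
    # Rewrite the Npu call ops in the root (Cpu) subgraph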
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        scheduler_options.use_ifm_ofm_overlap,
        options.tensor_allocator,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )
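
    # Estimate the performance of the compiled network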
    npu_performance.calc_performance_for_network(nng, arch)