# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import high_level_command_to_npu_op
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from .debug_database import DebugDatabase
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .scheduler import OptimizationStrategy
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .utils import progress_print


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions
    - ArchitectureFeatures is for changing the Ethos-U and system architecture
    - CompilerOptions is for changing the behaviour of the compiler"""

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        verbose_weights=False,
        verbose_performance=False,
        verbose_progress=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        force_symmetric_int_weights=False,
        output_dir="outputs",
        cpu_tensor_alignment=Tensor.AllocationQuantum,
        hillclimb_max_iterations=None,
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.verbose_weights = verbose_weights
        self.verbose_performance = verbose_performance
        self.verbose_progress = verbose_progress
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.force_symmetric_int_weights = force_symmetric_int_weights
        self.output_dir = output_dir
        self.cpu_tensor_alignment = cpu_tensor_alignment
        self.hillclimb_max_iterations = hillclimb_max_iterations

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__

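# A minimal construction sketch for CompilerOptions (illustrative values only;
# all keyword arguments shown are real parameters of __init__ above):
#
#     opts = CompilerOptions(verbose_graph=True, timing=True, output_dir="out")
#     print(opts)  # -> "CompilerOptions: {'verbose_graph': True, ...}"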

def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted by the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1
    # (the next SRAM factor to try); dry_test is True while still bisecting.
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded on the first try; stop
            return (None, False)
        else:
            # Start bisecting, try the lower bound SRAM
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search the interval 0 - lower
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as a dry test
        return (best, False)
    # Next try; run only as a dry test
    return ((lower + upper) / 2, True)

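# Illustrative trace of next_sram_factor (each entry in alloc_results is the
# outcome of one allocation attempt at the factor returned by the previous call):
#
#     []                   -> (1.0, False)   first attempt: full SRAM, for real
#     [False]              -> (0.7, True)    full SRAM failed; dry-run the lower bound
#     [False, True]        -> (0.85, True)   0.7 fitted; bisect the interval 0.7..1.0
#     [False, True, False] -> (0.775, True)  0.85 failed; bisect the interval 0.7..0.85
#     ...after MAX_ITERATIONS results       -> (best, False), re-run the best factor for real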
140
Tim Halle6ccd872020-11-09 16:46:37 +0000141def _record_operator(op, arch):
wilisa0179a89042022-11-02 17:18:43 +0000142 if op.type not in (Op.Const, Op.Placeholder):
Tim Halle6ccd872020-11-09 16:46:37 +0000143 DebugDatabase.add_source(op)
144
145
Tim Halld8339a72021-05-27 18:49:40 +0100146def _check_schedule(nng, arch, scheduler_options):
147 # check sram usage for optimisation strategy
148 sram_usage = nng.get_root_subgraph().memory_used.get(MemArea.Sram)
149 if sram_usage is not None and scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
150 if sram_usage > scheduler_options.optimization_sram_limit:
151 print(
152 f"Warning: SRAM target for arena memory area exceeded."
153 f" Target = {scheduler_options.optimization_sram_limit} Bytes,"
154 f" Actual = {sram_usage} Bytes"
155 )
156
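# Example of the warning emitted by _check_schedule (the numbers are illustrative):
#
#     Warning: SRAM target for arena memory area exceeded. Target = 400000 Bytes, Actual = 412352 Bytes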
157
wilisa0189a8cdd2022-08-22 16:13:06 +0000158def compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename):
Tim Hall79d07d22020-04-27 18:20:16 +0100159 assert verify_graph_health(nng)
Raul Farkas1c54ac12023-04-26 07:49:15 +0100160 verbose_progress = scheduler_options.verbose_progress
Tim Halle6ccd872020-11-09 16:46:37 +0000161
162 # Pre-optimisation operator tracking
163 for sg in nng.subgraphs:
164 visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])
165
Raul Farkas1c54ac12023-04-26 07:49:15 +0100166 progress_print(verbose_progress, "Performing graph optimisation")
wilisa0146c94772023-02-08 09:56:14 +0000167 nng = graph_optimiser.optimise_graph(
168 nng, arch, network_type, options.verbose_graph, options.force_symmetric_int_weights
169 )
Tim Hall79d07d22020-04-27 18:20:16 +0100170 assert verify_graph_health(nng)
171
172 if options.verbose_quantization:
173 nng.print_graph_with_tensor_quantization()
174
Raul Farkas1c54ac12023-04-26 07:49:15 +0100175 progress_print(verbose_progress, "Defining tensor purpose")
Tim Hall79d07d22020-04-27 18:20:16 +0100176 nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
177 assert verify_graph_health(nng)
Raul Farkas1c54ac12023-04-26 07:49:15 +0100178
179 progress_print(verbose_progress, "Performing pass packing")
Tim Hall79d07d22020-04-27 18:20:16 +0100180 pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
181 assert verify_graph_health(nng)
182
Raul Farkas1c54ac12023-04-26 07:49:15 +0100183 progress_print(verbose_progress, "Extracting npu subgraphs")
Tim Hall79d07d22020-04-27 18:20:16 +0100184 extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
185
Tim Hall79d07d22020-04-27 18:20:16 +0100186 assert verify_graph_health(nng)
187 if options.timing:
188 start = time.time()
189
Raul Farkas1c54ac12023-04-26 07:49:15 +0100190 progress_print(verbose_progress, "Scheduling passes")
Tim Hall79d07d22020-04-27 18:20:16 +0100191 # Run the scheduler
Tim Halld8339a72021-05-27 18:49:40 +0100192 scheduler.schedule_passes(nng, arch, options, scheduler_options)
193 _check_schedule(nng, arch, scheduler_options)
Tim Hall79d07d22020-04-27 18:20:16 +0100194
195 if options.timing:
196 stop = time.time()
197 print("Scheduling took %f s" % (stop - start))
198 start = time.time()
199
Tim Hall79d07d22020-04-27 18:20:16 +0100200 # LiveRanges for constant tensors for all Npu subgraphs
201 permanent_storage = arch.permanent_storage_mem_area
202 lr_graph_flash = live_range.LiveRangeGraph()
203
204 # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
205 scratch_tens = None
Patrik Gustavsson3ab94522020-06-29 17:36:55 +0200206 scratch_fast_tens = None
Tim Hall79d07d22020-04-27 18:20:16 +0100207 flash_tens = None
208
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200209 # Create list of NPU subgraphs with same order as the list of all subgraphs
210 npu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Npu]
Tim Hall79d07d22020-04-27 18:20:16 +0100211
Raul Farkas1c54ac12023-04-26 07:49:15 +0100212 progress_print(verbose_progress, "Calculating live ranges for constant NPU tensors")
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200213 # Calculate live ranges for all constant Npu tensors, in permanent storage
214 for sg in npu_subgraphs:
215 lr_graph_flash = live_range.create_linear_live_range_graph(
Jonas Ohlssond8575072022-03-30 10:30:25 +0200216 sg,
217 permanent_storage,
218 MemType.Permanent_NPU,
219 lr_graph=lr_graph_flash,
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200220 )
221
222 if npu_subgraphs:
Raul Farkas1c54ac12023-04-26 07:49:15 +0100223 progress_print(verbose_progress, "Allocating NPU constant tensors to the first NPU subgraph")
Tim Hall25f605c2020-05-18 18:04:26 +0100224 # Allocate all Npu constant tensors to the first Npu subgraph since it is
225 # processed first during serialization into tensors
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200226 first_npu_sg = npu_subgraphs[0]
Tim Hall25f605c2020-05-18 18:04:26 +0100227 tensor_allocation.allocate_tensors(
228 nng,
229 first_npu_sg,
230 arch,
231 permanent_storage,
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200232 set((MemType.Permanent_NPU,)),
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200233 tensor_allocator=TensorAllocator.LinearAlloc,
234 verbose_allocation=options.verbose_allocation,
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200235 lr_graph=lr_graph_flash,
Tim Hall25f605c2020-05-18 18:04:26 +0100236 )
Tim Hall79d07d22020-04-27 18:20:16 +0100237
Tim Hall79d07d22020-04-27 18:20:16 +0100238 root_sg = nng.get_root_subgraph()
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200239
Raul Farkas1c54ac12023-04-26 07:49:15 +0100240 progress_print(verbose_progress, "Generating command stream")
Tim Hall79d07d22020-04-27 18:20:16 +0100241 # Generate command streams and serialise Npu-ops into tensors
Dwight Lidman62cdfe52021-10-11 16:39:10 +0200242 for sg in npu_subgraphs:
243 high_level_command_stream_generator.generate_high_level_command_stream_for_schedule(
244 nng, sg, arch, options.verbose_high_level_command_stream
245 )
246 lut.optimize_high_level_cmd_stream(sg, arch)
247 high_level_command_to_npu_op.generate_register_command_stream_for_sg(
248 nng, sg, arch, options.verbose_register_command_stream
249 )
250 scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
251 sg, arch, scratch_tens, scratch_fast_tens, flash_tens
252 )
Tim Hall79d07d22020-04-27 18:20:16 +0100253
Johan Alfvén673683b2022-09-05 09:39:47 +0200254 # Create list of CPU subgraphs with same order as the list of all subgraphs
255 cpu_subgraphs = [sg for sg in nng.subgraphs if sg.placement == PassPlacement.Cpu]
256 for sg in cpu_subgraphs:
257 npu_serialisation.rewrite_npu_call_ops(sg, arch)
Tim Hall79d07d22020-04-27 18:20:16 +0100258
Jacob Bohlin268394d2020-08-13 13:24:59 +0200259 # Set Scratch and Fast_scratch Tensor size
260 if scratch_tens is not None:
261 scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
262 if scratch_fast_tens is not None:
263 scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
264
Raul Farkas1c54ac12023-04-26 07:49:15 +0100265 progress_print(verbose_progress, "Allocating CPU constant tensors")
Tim Hall79d07d22020-04-27 18:20:16 +0100266 # Allocate all Cpu constant tensors, this is done last because the Npu-ops
267 # have to be serialized into flash and scratch tensors first
268 tensor_allocation.allocate_tensors(
269 nng,
270 root_sg,
271 arch,
272 permanent_storage,
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200273 set((MemType.Permanent_CPU,)),
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200274 tensor_allocator=TensorAllocator.LinearAlloc,
275 verbose_allocation=options.verbose_allocation,
Tim Hallb9b515c2020-11-01 21:27:19 +0000276 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hall79d07d22020-04-27 18:20:16 +0100277 )
Raul Farkas1c54ac12023-04-26 07:49:15 +0100278 progress_print(verbose_progress, "Calculating new performance for the network")
wilisa0189a8cdd2022-08-22 16:13:06 +0000279 npu_performance.calc_new_performance_for_network(
280 nng, arch, network_type, options.verbose_performance, output_basename
281 )
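
# A rough call sketch (the caller wiring is an assumption; in practice the tool's
# top level parses the input network and builds nng/arch/options before this call):
#
#     compiler_driver(nng, arch, options, scheduler_options, network_type, output_basename)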