# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import high_level_command_to_npu_op
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .debug_database import DebugDatabase
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .tensor import MemType
from .tensor import Tensor


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

Note the difference between ArchitectureFeatures and CompilerOptions
- ArchitectureFeatures is for changing the Ethos-U and system architecture
- CompilerOptions is for changing the behaviour of the compiler
"""

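    # Example (illustrative values, not from the original source): the front end typically builds
    # this object from parsed command-line arguments, e.g.
    #     CompilerOptions(verbose_packing=True, tensor_allocator=TensorAllocator.Greedy, output_dir="outputs")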
    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        verbose_weights=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
        cpu_tensor_alignment=Tensor.AllocationQuantum,
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.verbose_weights = verbose_weights
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir
        self.cpu_tensor_alignment = cpu_tensor_alignment

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted with the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1 (the next
    # SRAM factor to try), and dry_test is True while still bisecting.
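    # Example: with alloc_results == [False, True, True] the allocation at factor 1.0 failed but
    # 0.7 and 0.85 succeeded, so the remaining search interval is [0.85, 1.0] and the next call
    # returns (0.925, True).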
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded on the first try; stop
            return (None, False)
        else:
            # Start bisecting, try lower-bound SRAM
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search interval 0 - lower
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as dry test
        return (best, False)
    # Next try; run only as dry test
    return ((lower + upper) / 2, True)


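# Callback used with visit_graph_post_order below: records every non-constant operator in the
# DebugDatabase before the graph optimisation passes rewrite the graph.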
def _record_operator(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_source(op)


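# Main compilation pipeline: graph optimisation and tensor marking, packing into passes, extraction
# of Npu subgraphs, scheduling, weight/scale compression, tensor allocation, command stream
# generation, serialisation of the Npu subgraphs into tensors, and finally performance estimation.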
def compiler_driver(nng, arch, options, scheduler_options):
    assert verify_graph_health(nng)

    # Pre-optimisation operator tracking
    for sg in nng.subgraphs:
        visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])

    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

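    # Split the graph: groups of passes that can run on the Ethos-U are moved into separate Npu
    # subgraphs, each invoked from the Cpu (root) subgraph through a custom Npu-op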
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calc and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

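    # Optionally move the packed scale/bias tensors to fast storage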
    if scheduler_options.cache_bias_scale_tensor:
        scheduler.move_scales_to_fast_storage(nng, arch)

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg,
                permanent_storage,
                MemType.Permanent_NPU,
                ignore_subgraph_input_output_tensors=True,
                lr_graph=lr_graph_flash,
            )

    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph since it is
        # processed first during serialisation into tensors
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            tensor_allocator=TensorAllocator.LinearAlloc,
            verbose_allocation=options.verbose_allocation,
            lr_graph=lr_graph_flash,
        )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph.
    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
    # arch.fast_storage_mem_area.
    # When these memory areas are the same, all non-constant tensors are allocated together.
    # Otherwise they are allocated separately.

    root_sg = nng.get_root_subgraph()

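    # Build the list of (memory area, memory-type set) pairs to allocate. With spilling enabled,
    # fast storage (Scratch_fast) is allocated first, followed by the feature map storage area;
    # otherwise Scratch and Scratch_fast are allocated together in the feature map storage area.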
    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

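    # Allocate each memory area in turn. In the spilling case the fast storage allocation is retried
    # with different SRAM limits, using next_sram_factor() to bisect towards the largest usage that fits.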
    for mem_area, mem_type_set in alloc_list:
        if arch.is_spilling_enabled() and mem_area == arch.fast_storage_mem_area:
            # For the case where scratch_fast != scratch: attempt to place feature maps used between
            # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
            alloc_results = []
            while True:
                assert len(alloc_results) < 10, "Infinite allocator loop"
                sram_factor, dry_test = next_sram_factor(alloc_results)
                if sram_factor is None:
                    break
                # Try to move as many feature maps as possible to SRAM before allocating
                sram_limit = sram_factor * arch.sram_size
                for sg in nng.subgraphs:
                    scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
                alloc_success = tensor_allocation.allocate_tensors(
                    nng,
                    root_sg,
                    arch,
                    mem_area,
                    mem_type_set,
                    max_size=arch.sram_size,
                    dry_test=dry_test,
                    tensor_allocator=options.tensor_allocator,
                    verbose_allocation=options.verbose_allocation,
                    cpu_tensor_alignment=options.cpu_tensor_alignment,
                )
                if dry_test or not alloc_success:
                    for sg in nng.subgraphs:
                        scheduler.undo_use_fast_storage(sg, arch)
                alloc_results.append(alloc_success)
            if not alloc_results[-1]:
                raise VelaError(
                    f"Sram limit {arch.sram_size} bytes has been exceeded by the scratch fast tensor. "
                    "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
                    "See OPTIONS.md for more information"
                )
        else:
            tensor_allocation.allocate_tensors(
                nng,
                root_sg,
                arch,
                mem_area,
                mem_type_set,
                tensor_allocator=options.tensor_allocator,
                verbose_allocation=options.verbose_allocation,
                cpu_tensor_alignment=options.cpu_tensor_alignment,
            )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        lut.optimize_high_level_cmd_stream(sg, arch)
        high_level_command_to_npu_op.generate_register_command_stream_for_sg(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
        )

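    # Rewrite the custom Npu-op call operators in the root (Cpu) subgraph so that they reference
    # the tensors produced by the serialisation above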
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

    # Set Scratch and Fast_scratch Tensor size
    if scratch_tens is not None:
        scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
    if scratch_fast_tens is not None:
        scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        tensor_allocator=TensorAllocator.LinearAlloc,
        verbose_allocation=options.verbose_allocation,
        cpu_tensor_alignment=options.cpu_tensor_alignment,
    )

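    # Estimate performance (cycle counts and memory bandwidth) for the compiled network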
    npu_performance.calc_performance_for_network(nng, arch)