# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
from .tensor import MemType
from .tensor import Tensor


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions
    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
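
    Example (illustrative values; every argument has a default):
        opts = CompilerOptions(verbose_packing=True, timing=True, output_dir="outputs")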
48"""

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        show_minimum_possible_allocation=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
        allocation_alignment=Tensor.AllocationQuantum,
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.show_minimum_possible_allocation = show_minimum_possible_allocation
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir
        self.allocation_alignment = allocation_alignment

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted with the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1 (the next
    # SRAM factor to try), and dry_test is True while still bisecting.
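    #
    # Illustrative trace (assuming the first full-SRAM attempt fails and the 0.7 bound fits):
    #   alloc_results == []            -> returns (1.0, False)   real attempt at full SRAM
    #   alloc_results == [False]       -> returns (0.7, True)    dry-run the lower bound
    #   alloc_results == [False, True] -> returns (0.85, True)   bisect upwards in [0.7, 1.0]
    #   ...after MAX_ITERATIONS results, the best fitting factor is re-run, not as a dry test.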
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded on the first try; stop
            return (None, False)
        else:
            # Start bisecting, try the lower-bound SRAM factor
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search the interval [0, lower]
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as a dry test
        return (best, False)
    # Next try; run only as a dry test
    return ((lower + upper) / 2, True)


def compiler_driver(nng, arch, options, scheduler_options):
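    # Sequencing overview: graph optimisation -> tensor purpose marking -> DMA
    # insertion -> pass packing -> Npu subgraph extraction -> scheduling ->
    # weight compression -> tensor allocation -> command stream generation and
    # serialisation -> performance estimation.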
    assert verify_graph_health(nng)
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calc and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg,
                permanent_storage,
                MemType.Permanent_NPU,
                ignore_subgraph_input_output_tensors=True,
                lr_graph=lr_graph_flash,
            )
    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph, since it is
        # processed first during serialisation into tensors
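        # (nng.subgraphs[0] is assumed to be the root Cpu subgraph, which is allocated
        # separately below; the assert verifies that index 1 really is an Npu subgraph.)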
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
            tensor_allocator=TensorAllocator.LinearAlloc,
            verbose_allocation=options.verbose_allocation,
            show_minimum_possible_allocation=options.show_minimum_possible_allocation,
            lr_graph=lr_graph_flash,
        )
    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph.
    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area
    # or arch.fast_storage_mem_area.
    # When these memory areas are the same, all non-constant tensors are allocated together.
    # Otherwise they are allocated separately.
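    # (As an illustration: in a shared-SRAM style configuration both areas map to the
    # same on-chip SRAM, so a single allocation covers all non-constant tensors; with a
    # dedicated SRAM, the feature maps live in the slower memory and Scratch_fast is
    # allocated separately. This reading is an assumption based on common Ethos-U
    # system configurations.)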

    root_sg = nng.get_root_subgraph()

    alloc_list = []
    feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area
    if feature_maps_in_fast_storage:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    for mem_area, mem_type_set in alloc_list:
        if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area:
            tensor_allocation.allocate_tensors(
                nng,
                root_sg,
                arch,
                mem_area,
                mem_type_set,
                use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
                tensor_allocator=options.tensor_allocator,
                verbose_allocation=options.verbose_allocation,
                show_minimum_possible_allocation=options.show_minimum_possible_allocation,
                allocation_alignment=options.allocation_alignment,
            )
        else:
            # For the case where scratch_fast != scratch: attempt to place feature maps used between
            # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
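            # Each attempt's success is appended to alloc_results, which next_sram_factor
            # inspects to propose the next SRAM factor to try; dry-test attempts and
            # failures are rolled back via scheduler.undo_use_fast_storage before retrying.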
            alloc_results = []
            while True:
                assert len(alloc_results) < 10, "Infinite allocator loop"
                sram_factor, dry_test = next_sram_factor(alloc_results)
                if sram_factor is None:
                    break
                # Try to move as many feature maps as possible to SRAM before allocating
                sram_limit = sram_factor * arch.sram_size
                for sg in nng.subgraphs:
                    scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
                alloc_success = tensor_allocation.allocate_tensors(
                    nng,
                    root_sg,
                    arch,
                    mem_area,
                    mem_type_set,
                    max_size=arch.sram_size,
                    dry_test=dry_test,
                    use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
                    tensor_allocator=options.tensor_allocator,
                    verbose_allocation=options.verbose_allocation,
                    show_minimum_possible_allocation=options.show_minimum_possible_allocation,
                    allocation_alignment=options.allocation_alignment,
                )
                if dry_test or not alloc_success:
                    for sg in nng.subgraphs:
                        scheduler.undo_use_fast_storage(sg, arch)
                alloc_results.append(alloc_success)
            if not alloc_results[-1]:
                raise VelaError(
                    "Sram limit {} bytes has been exceeded by the scratch fast tensor. "
                    "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
                    "See OPTIONS.md for more information.".format(arch.sram_size)
                )
280 # Generate command streams and serialise Npu-ops into tensors
281 for sg in nng.subgraphs:
282 high_level_command_stream_generator.generate_high_level_command_stream(
283 nng, sg, arch, options.verbose_high_level_command_stream
284 )
Louis Verhaard0b8268a2020-08-05 16:11:29 +0200285 lut.optimize_high_level_cmd_stream(sg, arch)
Tim Hall79d07d22020-04-27 18:20:16 +0100286 register_command_stream_generator.generate_register_command_stream(
287 nng, sg, arch, options.verbose_register_command_stream
288 )
Patrik Gustavsson3ab94522020-06-29 17:36:55 +0200289 scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
290 nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
Tim Hall79d07d22020-04-27 18:20:16 +0100291 )
292
293 npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
294
    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        use_ifm_ofm_overlap=scheduler_options.use_ifm_ofm_overlap,
        tensor_allocator=TensorAllocator.LinearAlloc,
        verbose_allocation=options.verbose_allocation,
        show_minimum_possible_allocation=options.show_minimum_possible_allocation,
        allocation_alignment=options.allocation_alignment,
    )

    npu_performance.calc_performance_for_network(nng, arch)
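

# Illustrative call site (a simplified, hypothetical sketch; in this codebase the
# driver is invoked from vela.py with objects built from the command-line options):
#
#     options = CompilerOptions(timing=True, output_dir="outputs")
#     compiler_driver(nng, arch, options, scheduler_options)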