# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import high_level_command_to_npu_op
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from .debug_database import DebugDatabase
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .scheduler import OptimizationStrategy
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor

class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions:
    - ArchitectureFeatures is for changing the Ethos-U and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
    """

    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        verbose_weights=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
        cpu_tensor_alignment=Tensor.AllocationQuantum,
    ):
        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.verbose_weights = verbose_weights
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir
        self.cpu_tensor_alignment = cpu_tensor_alignment

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__
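
# A minimal usage sketch (illustrative only - the keyword arguments below are
# real CompilerOptions parameters, but the chosen values are arbitrary):
#
#     opts = CompilerOptions(verbose_graph=True, timing=True, output_dir="outputs")
#     print(opts)  # -> "CompilerOptions: {'verbose_graph': True, ...}"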


def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted with the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1 (the next
    # SRAM factor to try), and dry_test is True while still bisecting.
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded at the first try; stop
            return (None, False)
        else:
            # Start bisecting, try the lower bound SRAM
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search the interval 0 - lower
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as a dry test
        return (best, False)
    # Next try; run only as a dry test
    return ((lower + upper) / 2, True)
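
# A hedged sketch of the feedback loop a caller could run around next_sram_factor
# (try_allocation and max_sram are hypothetical names; the real loop lives in the
# scheduling/allocation code):
#
#     alloc_results = []
#     while True:
#         factor, dry_test = next_sram_factor(alloc_results)
#         if factor is None:
#             break  # bisection finished
#         success = try_allocation(sram_limit=int(factor * max_sram), dry_test=dry_test)
#         alloc_results.append(success)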


def _record_operator(op, arch):
    # Track every non-constant operator in the debug database before optimisation
    if op.type != Op.Const:
        DebugDatabase.add_source(op)


def _check_schedule(nng, arch, scheduler_options):
    # Check the SRAM usage against the scheduler's optimisation strategy
    sram_usage = nng.get_root_subgraph().memory_used.get(MemArea.Sram)
    if sram_usage is not None and scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
        if sram_usage > scheduler_options.optimization_sram_limit:
            print(
                f"Warning: SRAM target for arena memory area exceeded."
                f" Target = {scheduler_options.optimization_sram_limit} Bytes,"
                f" Actual = {sram_usage} Bytes"
            )


def compiler_driver(nng, arch, options, scheduler_options, network_type):
    assert verify_graph_health(nng)

    # Pre-optimisation operator tracking
    for sg in nng.subgraphs:
        visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])

    nng = graph_optimiser.optimise_graph(nng, arch, network_type, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, options, scheduler_options)
    _check_schedule(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Live ranges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.create_linear_live_range_graph(
                sg, permanent_storage, MemType.Permanent_NPU, lr_graph=lr_graph_flash,
            )

    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph, since it is
        # processed first during serialisation into tensors
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            tensor_allocator=TensorAllocator.LinearAlloc,
            verbose_allocation=options.verbose_allocation,
            lr_graph=lr_graph_flash,
        )

    root_sg = nng.get_root_subgraph()

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            high_level_command_stream_generator.generate_high_level_command_stream_for_schedule(
                nng, sg, arch, options.verbose_high_level_command_stream
            )
            lut.optimize_high_level_cmd_stream(sg, arch)
            high_level_command_to_npu_op.generate_register_command_stream_for_sg(
                nng, sg, arch, options.verbose_register_command_stream
            )
            scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
                sg, arch, scratch_tens, scratch_fast_tens, flash_tens
            )

    npu_serialisation.rewrite_npu_call_ops(root_sg, arch)

    # Set the Scratch and Scratch_fast tensor sizes
    if scratch_tens is not None:
        scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
    if scratch_fast_tens is not None:
        scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        tensor_allocator=TensorAllocator.LinearAlloc,
        verbose_allocation=options.verbose_allocation,
        cpu_tensor_alignment=options.cpu_tensor_alignment,
    )

    npu_performance.calc_new_performance_for_network(nng, arch)
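
# A hedged sketch of a typical invocation (illustrative only; in the real tool
# the main entry point builds nng, arch, options and scheduler_options from the
# command line before calling this driver):
#
#     options = CompilerOptions(timing=True)
#     compiler_driver(nng, arch, options, scheduler_options, network_type)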