# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .rewrite_graph import verify_graph_health
from .tensor import MemType


class CompilerOptions:
42 """Set of options to change compiler behaviour - verbosity, targets, turning off passes.
43
44Note the difference between ArchitectureFeatures and CompilerOptions
45- ArchitectureFeatures is for changing the Ethos-U55 and system architecture
46- CompilerOptions is for changing the behaviour of the compiler
47"""
48
49 def __init__(
50 self,
51 verbose_graph=False,
52 verbose_quantization=False,
53 verbose_packing=False,
54 verbose_tensor_purpose=False,
55 verbose_tensor_format=False,
56 verbose_allocation=False,
57 verbose_high_level_command_stream=False,
58 verbose_register_command_stream=False,
59 verbose_operators=False,
60 show_minimum_possible_allocation=False,
61 show_cpu_operations=False,
62 tensor_allocator=TensorAllocator.Greedy,
63 timing=False,
64 output_dir="outputs",
65 ):
66
67 self.verbose_graph = verbose_graph
68 self.verbose_quantization = verbose_quantization
69 self.verbose_packing = verbose_packing
70 self.verbose_tensor_purpose = verbose_tensor_purpose
71 self.verbose_tensor_format = verbose_tensor_format
72 self.verbose_allocation = verbose_allocation
73 self.verbose_high_level_command_stream = verbose_high_level_command_stream
74 self.verbose_register_command_stream = verbose_register_command_stream
75 self.verbose_operators = verbose_operators
76 self.show_minimum_possible_allocation = show_minimum_possible_allocation
77 self.show_cpu_operations = show_cpu_operations
78 self.tensor_allocator = tensor_allocator
79 self.timing = timing
80 self.output_dir = output_dir
81
82 def __str__(self):
83 return type(self).__name__ + ": " + str(self.__dict__)
84
85 __repr__ = __str__
86
87
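# Minimal usage sketch (illustrative only; assumes a parsed network graph `nng`,
# an ArchitectureFeatures instance `arch` and a scheduler options object as
# expected by scheduler.schedule_passes()):
#
#     options = CompilerOptions(verbose_packing=True, timing=True)
#     compiler_driver(nng, arch, options, scheduler_options)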
def compiler_driver(nng, arch, options, scheduler_options):
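    """Sequence the compiler passes for the network nng on the given architecture.

    The main stages are: graph optimisation, tensor purpose and format marking,
    DMA insertion, pass packing, Npu subgraph extraction, scheduling, weight
    compression, tensor allocation, command stream generation and Npu
    serialisation, followed by a performance calculation for the network.
    """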
    assert verify_graph_health(nng)
    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

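    # Extract the Npu-placed passes into separate Npu subgraphs, leaving the
    # Cpu operations in the root subgraph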
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    mark_tensors.mark_tensor_format(nng, arch, options.verbose_tensor_format)
    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calc and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg,
                permanent_storage,
                MemType.Permanent_NPU,
                ignore_subgraph_input_output_tensors=True,
                lr_graph=lr_graph_flash,
            )

    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph since it is
        # processed first during serialization into tensors
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            scheduler_options.use_ifm_ofm_overlap,
            TensorAllocator.LinearAlloc,
            options.verbose_allocation,
            options.show_minimum_possible_allocation,
            lr_graph_flash,
        )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph.
    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
    # arch.fast_storage_mem_area.
    # When these memory areas are the same, all non-constant tensors are allocated together.
    # Otherwise they are allocated separately.

    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        alloc_list.append(mem_alloc_scratch)
        alloc_list.append(mem_alloc_scratch_fast)

    for alloc in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            alloc[0],
            alloc[1],
            scheduler_options.use_ifm_ofm_overlap,
            options.tensor_allocator,
            options.verbose_allocation,
            options.show_minimum_possible_allocation,
        )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        lut.optimize_high_level_cmd_stream(sg, arch)
        register_command_stream_generator.generate_register_command_stream(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
        )

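    # Rewrite the Npu call ops in the root (Cpu) subgraph now that the Npu
    # subgraphs have been serialised into the scratch and flash tensors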
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

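    # When fast storage is a separate memory area, the scratch fast tensor must
    # fit within the SRAM size of the target system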
    if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
        if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
            raise VelaError(
                "SRAM limit of {} bytes exceeded by the scratch fast tensor ({} bytes)".format(
                    arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
                )
            )

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        scheduler_options.use_ifm_ofm_overlap,
        TensorAllocator.LinearAlloc,
        options.verbose_allocation,
        options.show_minimum_possible_allocation,
    )

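    # Calculate a performance estimate for the compiled network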
    npu_performance.calc_performance_for_network(nng, arch)