# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Main entry point for the Vela compiler.
#
# Provides the command line interface, options parsing, and network loading before calling the compiler driver.

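# Example invocation (illustrative only; it assumes the package is installed and exposes a 'vela' console
# entry point that calls main() below, and that 'my_network.tflite' exists):
#
#   vela my_network.tflite --accelerator-config ethos-u55-256 --output-dir ./output
#
# The same run can be driven programmatically, e.g. main(["my_network.tflite", "--output-dir", "./output"]).
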
import sys
import os.path
import os
import time
import subprocess
import configparser
import argparse
import ast

from . import architecture_features
from . import stats_writer
from . import tflite_writer
from . import model_reader
from . import compiler_driver
from . import scheduler
from ._version import __version__
from .scheduler import ParetoMetric
from .nn_graph import MemArea, TensorFormat, TensorAllocator, PassPlacement


def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
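    """Read the network, run the compiler driver, write the statistics and output files, and return the graph."""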
    if compiler_options.timing:
        start = time.time()

    nng = model_reader.read_model(fname, model_reader_options)

    if not nng:
        print("reading of", fname, "failed")
        assert False

    if compiler_options.verbose_operators:
        nng.print_operators()

    if compiler_options.timing:
        stop = time.time()
        print("Model reading took %f s" % (stop - start))
        start = time.time()

    compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)

    passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_pass_metrics_csv(nng, passes_csv_file)

    summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)

    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)

    if fname.endswith(".tflite"):
        tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))

    if compiler_options.timing:
        stop = time.time()
        print("Compiler driver took %f s" % (stop - start))

    return nng


def print_subgraph_io_summary(nng):
    """Print a summary of all the input and output tensor sizes for all subgraphs.
    Also displays the total tensor size and the SRAM memory used.
84 """
85
86 print("Subgraph IO Summary")
87 print("-------------------")
88 print("NNG: {0}".format(nng.name))
89 max_sg_size = 0
90 for sg in reversed(nng.subgraphs):
91 print(" Subgraph: {0} = {1}".format(sg.name, sg.placement))
92 sg_size = 0
93
94 if sg.placement == PassPlacement.Npu:
95 for tens in sg.input_tensors + [sg.scratch_tensor] + sg.output_tensors:
96 if tens in sg.input_tensors:
97 tens_dir = "In"
98 elif tens in sg.output_tensors:
99 tens_dir = "Out"
100 else:
101 tens_dir = "In/Out"
102
103 size = tens.elements() * tens.element_size() / 1024.0
104 sg_size = sg_size + size
105 print(" Tensor [{0}]: {1} = {2} KiB".format(tens_dir, tens.name, size))
106
107 print(" Total Size = {0} KiB".format(sg_size))
108 print(" SRAM Memory Used = {0} KiB".format(sg.memory_used.get(MemArea.Sram, 0) / 1024.0))
109 max_sg_size = max(sg_size, max_sg_size)
110
111 print(" Maximum Subgraph Size = {0} KiB".format(max_sg_size))
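
# Example output from print_subgraph_io_summary() (illustrative; the names and sizes below are hypothetical):
#
#   Subgraph IO Summary
#   -------------------
#   NNG: my_network
#      Subgraph: main_subgraph = PassPlacement.Npu
#         Tensor [In]: input = 150.0 KiB
#         Tensor [In/Out]: scratch = 96.0 KiB
#         Tensor [Out]: output = 1.0 KiB
#      Total Size = 247.0 KiB
#      SRAM Memory Used = 96.0 KiB
#      Maximum Subgraph Size = 247.0 KiB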


def main(args=None):
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")

    parser.add_argument(
        "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
    )

    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument(
        "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
    )
    parser.add_argument("--config", type=str, help="Location of vela configuration file")
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")

    parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
    parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
    parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
    parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
    parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
    parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
    parser.add_argument(
        "--verbose-pareto-frontier-schedules",
        action="store_true",
        help="Show all schedules along the pareto frontier of optimisation criteria",
    )
    parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
    parser.add_argument(
        "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
    )
    parser.add_argument(
        "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
    )
    parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")

    parser.add_argument(
        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
    )
    parser.add_argument(
        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
    )
    parser.add_argument(
        "--cascading",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
    )
    parser.add_argument(
        "--ifm-ofm-overlap",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
    )
    parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
    parser.add_argument(
        "--inter-pass-cycle-delay",
        type=int,
        default=0,
        help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
    )
    parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
    parser.add_argument(
        "--accelerator-config",
        type=str,
        default="ethos-u55-256",
        choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
        help="Accelerator configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--system-config",
        type=str,
        default="internal-default",
        help="System configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--dram-bandwidth",
        type=float,
        default=0.0,
        help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
    )
    parser.add_argument(
        "--permanent-storage",
        default=MemArea.OffChipFlash,
        type=lambda s: MemArea[s],
        # Note: the slice below is assumed to limit the choices to the flash areas
        # (OnChipFlash and OffChipFlash) referred to in the help text.
        choices=list(MemArea)[3:-1],
        help=(
            "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
            "'OnChipFlash' (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--tensor-allocator",
        default=TensorAllocator.Greedy,
        type=lambda s: TensorAllocator[s],
        choices=list(TensorAllocator),
        help="Tensor Allocator algorithm (default: %(default)s)",
    )
    parser.add_argument(
        "--show-subgraph-io-summary",
        action="store_true",
        help="Shows a summary of all the subgraphs and their inputs and outputs",
    )
    parser.add_argument(
        "--ifm-streaming",
        type=ast.literal_eval,
        default=True,
        choices=[True, False],
        help="Controls scheduler IFM streaming search (default: %(default)s)",
    )
    parser.add_argument(
        "--block-config-limit",
        type=int,
        default=16,
        help="Limit block config search space, use zero for unlimited (default: %(default)s)",
    )
    parser.add_argument(
        "--global-memory-clock-scale",
        type=float,
        default=1.0,
        help=(
            "Performs an additional scaling of the individual memory clock scales specified by the system config "
            "(default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--pareto-metric",
        default=ParetoMetric.BwCycMem,
        type=lambda s: ParetoMetric[s],
        choices=list(ParetoMetric),
        help="Controls the calculation of the pareto metric (default: %(default)s)",
    )
    parser.add_argument(
        "--recursion-limit",
        type=int,
        default=10000,
        help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
    )
    parser.add_argument(
        "--max-block-dependency",
        type=int,
        default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
        choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
        help=(
            "Set the maximum value that can be used for the block dependency between npu kernel operations "
            "(default: %(default)s)"
        ),
    )

    args = parser.parse_args(args=args)

    # Read configuration file
    config_file = args.config
    config = None
    if config_file is not None:
        with open(config_file) as f:
            config = configparser.ConfigParser()
            config.read_file(f)

    if args.network is None:
        parser.error("the following argument is required: NETWORK")

    sys.setrecursionlimit(args.recursion_limit)

    if args.force_block_config:
        force_block_config = architecture_features.Block.from_string(args.force_block_config)
    else:
        force_block_config = None

    arch = architecture_features.ArchitectureFeatures(
        vela_config=config,
        system_config=args.system_config,
        accelerator_config=args.accelerator_config,
        permanent_storage=args.permanent_storage,
        inter_pass_cycle_delay=args.inter_pass_cycle_delay,
        dram_bandwidth=args.dram_bandwidth,
        override_block_config=force_block_config,
        block_config_limit=args.block_config_limit,
        global_memory_clock_scale=args.global_memory_clock_scale,
        max_blockdep=args.max_block_dependency,
    )

    compiler_options = compiler_driver.CompilerOptions(
        verbose_graph=args.verbose_graph,
        verbose_quantization=args.verbose_quantization,
        verbose_packing=args.verbose_packing,
        verbose_tensor_purpose=args.verbose_tensor_purpose,
        verbose_tensor_format=args.verbose_tensor_format,
        verbose_allocation=args.verbose_allocation,
        verbose_high_level_command_stream=args.verbose_high_level_command_stream,
        verbose_register_command_stream=args.verbose_register_command_stream,
        verbose_operators=args.verbose_operators,
        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
        show_cpu_operations=args.show_cpu_operations,
        tensor_allocator=args.tensor_allocator,
        timing=args.timing,
        output_dir=args.output_dir,
    )

    scheduler_options = scheduler.SchedulerOptions(
        use_cascading=args.cascading,
        use_ifm_ofm_overlap=args.ifm_ofm_overlap,
        verbose_schedule=args.verbose_schedule,
        verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
        use_ifm_streaming=args.ifm_streaming,
        pareto_metric=args.pareto_metric,
    )

    model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)

    os.makedirs(args.output_dir, exist_ok=True)

    nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)

    if args.show_subgraph_io_summary:
        print_subgraph_io_summary(nng)

    return 0
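

# Convenience guard so the module can also be run directly (e.g. via 'python -m'); a sketch only,
# the packaged 'vela' console entry point is assumed to call main() in the same way.
if __name__ == "__main__":
    sys.exit(main())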