blob: 49f8c26c00f9aa046ca27028f19e3565d6f34100 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Main entry point for the Vela compiler.
18#
# Provides the command line interface, options parsing, and network loading before calling the compiler driver.
Diego Russoe8a10452020-04-21 17:39:10 +010020import argparse
21import ast
22import configparser
Diego Russoea6111a2020-04-14 18:41:58 +010023import os.path
24import sys
Tim Hall79d07d22020-04-27 18:20:16 +010025import time
Tim Hall79d07d22020-04-27 18:20:16 +010026
27from . import architecture_features
Diego Russoe8a10452020-04-21 17:39:10 +010028from . import compiler_driver
29from . import model_reader
30from . import scheduler
Tim Hall79d07d22020-04-27 18:20:16 +010031from . import stats_writer
32from . import tflite_writer
Tim Hall79d07d22020-04-27 18:20:16 +010033from ._version import __version__
Diego Russoe8a10452020-04-21 17:39:10 +010034from .nn_graph import PassPlacement
35from .nn_graph import TensorAllocator
Tim Hall79d07d22020-04-27 18:20:16 +010036from .scheduler import ParetoMetric
Diego Russoea6111a2020-04-14 18:41:58 +010037from .tensor import MemArea
Tim Hall79d07d22020-04-27 18:20:16 +010038
39
def process(fname, arch, model_reader_options, compiler_options, scheduler_options):
    """Read a network file, run the compiler driver on it and write the results.

    The pass-breakdown and summary CSV files are written into
    ``compiler_options.output_dir``, the performance metrics are printed, and,
    when ``fname`` ends with ".tflite", a ``<name>_vela.tflite`` file is written
    into the same directory.

    Args:
        fname: Filename of the network to read.
        arch: Architecture description handed to the compiler driver and the
            statistics writers; its ``system_config`` is used in the CSV names.
        model_reader_options: Options passed to ``model_reader.read_model``.
        compiler_options: Options for the compiler driver; ``timing``,
            ``verbose_operators``, ``show_cpu_operations`` and ``output_dir``
            are also read here.
        scheduler_options: Options passed through to the compiler driver.

    Returns:
        The network graph returned by the model reader, after compilation.

    Raises:
        AssertionError: If the model reader returns a falsy result.
    """
    if compiler_options.timing:
        start = time.time()

    nng = model_reader.read_model(fname, model_reader_options)

    if not nng:
        print("reading of", fname, "failed")
        # Raise explicitly rather than `assert False`: assert statements are
        # removed under `python -O`, which would let compilation carry on
        # with an unusable graph.
        raise AssertionError("reading of {0} failed".format(fname))

    if compiler_options.verbose_operators:
        nng.print_operators()

    if compiler_options.timing:
        stop = time.time()
        print("Model reading took %f s" % (stop - start))
        start = time.time()

    compiler_driver.compiler_driver(nng, arch, compiler_options, scheduler_options)

    passes_csv_file = "%s/%s_pass-breakdown_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_pass_metrics_csv(nng, passes_csv_file)

    summary_csv_file = "%s/%s_summary_%s.csv" % (compiler_options.output_dir, nng.name, arch.system_config)
    stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)

    stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)

    # Only TensorFlow Lite inputs are written back out as an optimised model.
    if fname.endswith(".tflite"):
        tflite_writer.write_tflite(nng, "%s/%s_vela.tflite" % (compiler_options.output_dir, nng.name))

    if compiler_options.timing:
        stop = time.time()
        print("Compiler driver took %f s" % (stop - start))

    return nng
76
77
def print_subgraph_io_summary(nng):
    """Print the input, scratch and output tensor sizes of every subgraph.

    Subgraphs are visited in reverse order. For each subgraph placed on the NPU
    the size of every tensor is printed in KiB, followed by the subgraph total
    and its SRAM usage. The largest subgraph total is printed last.
    """
    print("Subgraph IO Summary")
    print("-------------------")
    print("NNG: {0}".format(nng.name))

    largest_total = 0
    for subgraph in reversed(nng.subgraphs):
        print(" Subgraph: {0} = {1}".format(subgraph.name, subgraph.placement))

        on_npu = subgraph.placement == PassPlacement.Npu
        if not on_npu:
            # Only NPU subgraphs get a tensor breakdown.
            continue

        subgraph_total = 0
        io_tensors = subgraph.input_tensors + [subgraph.scratch_tensor] + subgraph.output_tensors
        for tensor in io_tensors:
            # Anything that is neither an input nor an output (the scratch
            # tensor) is reported as bidirectional.
            if tensor in subgraph.input_tensors:
                direction = "In"
            elif tensor in subgraph.output_tensors:
                direction = "Out"
            else:
                direction = "In/Out"

            kib = tensor.elements() * tensor.element_size() / 1024.0
            subgraph_total = subgraph_total + kib
            print(" Tensor [{0}]: {1} = {2} KiB".format(direction, tensor.name, kib))

        sram_kib = subgraph.memory_used.get(MemArea.Sram, 0) / 1024.0
        print(" Total Size = {0} KiB".format(subgraph_total))
        print(" SRAM Memory Used = {0} KiB".format(sram_kib))
        largest_total = max(subgraph_total, largest_total)

    print(" Maximum Subgraph Size = {0} KiB".format(largest_total))
109
110
def _enum_member_type(enum_cls):
    """Return an argparse ``type`` callable that looks up a member of enum_cls by name.

    A bare ``enum_cls[name]`` raises KeyError for an unknown name. argparse only
    converts ArgumentTypeError, TypeError and ValueError into a usage error, so
    the KeyError would otherwise escape as a traceback.
    """

    def lookup(name):
        try:
            return enum_cls[name]
        except KeyError:
            raise argparse.ArgumentTypeError(
                "invalid {0} name: {1!r}".format(enum_cls.__name__, name)
            ) from None

    return lookup


def _literal_type(text):
    """argparse ``type`` callable that evaluates text as a Python literal.

    ``ast.literal_eval`` raises SyntaxError for unparsable text, which argparse
    does not intercept; it is converted into a usage error here. ValueError is
    left alone because argparse already reports it.
    """
    try:
        return ast.literal_eval(text)
    except SyntaxError:
        raise argparse.ArgumentTypeError("invalid literal: {0!r}".format(text)) from None


def main(args=None):
    """Command line entry point: parse the options, then compile the given network.

    Args:
        args: List of command line argument strings. When None, ``sys.argv[1:]``
            is used.

    Returns:
        0 once the network has been processed.

    Raises:
        SystemExit: Raised by argparse for invalid arguments or ``--version``.
    """
    if args is None:
        args = sys.argv[1:]

    parser = argparse.ArgumentParser(prog="vela", description="Neural network model compiler for Ethos-U55")

    parser.add_argument(
        "network", metavar="NETWORK", type=str, default=None, nargs=None, help="Filename of network to process"
    )

    parser.add_argument("--version", action="version", version=__version__)
    parser.add_argument(
        "--output-dir", type=str, default="output", help="Output directory to write files to (default: %(default)s)"
    )
    parser.add_argument("--config", type=str, help="Location of vela configuration file")
    parser.add_argument("--batch-size", type=int, default=1, help="Batch size (default: %(default)s)")

    parser.add_argument("--verbose-graph", action="store_true", help="Verbose graph rewriter")
    parser.add_argument("--verbose-quantization", action="store_true", help="Verbose quantization")
    parser.add_argument("--verbose-packing", action="store_true", help="Verbose pass packing")
    parser.add_argument("--verbose-tensor-purpose", action="store_true", help="Verbose tensor purpose")
    parser.add_argument("--verbose-tensor-format", action="store_true", help="Verbose tensor format")
    parser.add_argument("--verbose-schedule", action="store_true", help="Verbose schedule")
    parser.add_argument(
        "--verbose-pareto-frontier-schedules",
        action="store_true",
        help="Show all schedules along the pareto frontier of optimisation criteria",
    )
    parser.add_argument("--verbose-allocation", action="store_true", help="Verbose tensor allocation")
    parser.add_argument(
        "--verbose-high-level-command-stream", action="store_true", help="Verbose high level command stream"
    )
    parser.add_argument(
        "--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
    )
    parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")

    parser.add_argument(
        "--show-minimum-possible-allocation", action="store_true", help="Show the minimum possible allocation"
    )
    parser.add_argument(
        "--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
    )
    parser.add_argument(
        "--cascading",
        type=_literal_type,
        default=True,
        choices=[True, False],
        help="Controls the packing of multiple passes into a cascade (default: %(default)s)",
    )
    parser.add_argument(
        "--ifm-ofm-overlap",
        type=_literal_type,
        default=True,
        choices=[True, False],
        help="Controls the overlapping of IFM and OFM buffers (default: %(default)s)",
    )
    parser.add_argument("--force-block-config", type=str, default="", help="Force a specific block configuration HxWxC")
    parser.add_argument(
        "--inter-pass-cycle-delay",
        type=int,
        default=0,
        help="Artificial delay between passes, measured in NPU cycles (default: %(default)s)",
    )
    parser.add_argument("--timing", action="store_true", help="Time the compiler doing operations")
    parser.add_argument(
        "--accelerator-config",
        type=str,
        default="ethos-u55-256",
        choices=list(architecture_features.ArchitectureFeatures.accelerator_configs.keys()),
        help="Accelerator configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--system-config",
        type=str,
        default="internal-default",
        help="System configuration to use (default: %(default)s)",
    )
    parser.add_argument(
        "--dram-bandwidth",
        type=float,
        default=0.0,
        help="DRAM memory bandwidth in GB/s, use zero to select the value from system config (default: %(default)s)",
    )
    parser.add_argument(
        "--permanent-storage",
        default=MemArea.OffChipFlash,
        type=_enum_member_type(MemArea),
        # NOTE(review): the slice depends on the declaration order of MemArea members - confirm it
        # still selects the intended areas whenever that enum changes.
        choices=list(MemArea)[3:-1],
        help=(
            "Memory area for permanent storage. To store the weights and other constant data in SRAM select "
            "'OnChipFlash' (default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--tensor-allocator",
        default=TensorAllocator.Greedy,
        type=_enum_member_type(TensorAllocator),
        choices=list(TensorAllocator),
        help="Tensor Allocator algorithm (default: %(default)s)",
    )
    parser.add_argument(
        "--show-subgraph-io-summary",
        action="store_true",
        help="Shows a summary of all the subgraphs and their inputs and outputs",
    )
    parser.add_argument(
        "--ifm-streaming",
        type=_literal_type,
        default=True,
        choices=[True, False],
        help="Controls scheduler IFM streaming search (default: %(default)s)",
    )
    parser.add_argument(
        "--block-config-limit",
        type=int,
        default=16,
        help="Limit block config search space, use zero for unlimited (default: %(default)s)",
    )
    parser.add_argument(
        "--global-memory-clock-scale",
        type=float,
        default=1.0,
        help=(
            "Performs an additional scaling of the individual memory clock scales specified by the system config "
            "(default: %(default)s)"
        ),
    )
    parser.add_argument(
        "--pareto-metric",
        default=ParetoMetric.BwCycMem,
        type=_enum_member_type(ParetoMetric),
        choices=list(ParetoMetric),
        help="Controls the calculation of the pareto metric (default: %(default)s)",
    )
    parser.add_argument(
        "--recursion-limit",
        type=int,
        default=10000,
        help="Set the recursion depth limit, may result in RecursionError if too low (default: %(default)s)",
    )
    parser.add_argument(
        "--max-block-dependency",
        type=int,
        default=architecture_features.ArchitectureFeatures.MAX_BLOCKDEP,
        choices=range(0, architecture_features.ArchitectureFeatures.MAX_BLOCKDEP + 1),
        help=(
            "Set the maximum value that can be used for the block dependency between npu kernel operations "
            "(default: %(default)s)"
        ),
    )

    args = parser.parse_args(args=args)

    # Read configuration file
    config_file = args.config
    config = None
    if config_file is not None:
        with open(config_file) as f:
            config = configparser.ConfigParser()
            config.read_file(f)

    # Defensive check: argparse already rejects a missing NETWORK positional during parse_args().
    if args.network is None:
        parser.error("the following argument is required: NETWORK")

    sys.setrecursionlimit(args.recursion_limit)

    if args.force_block_config:
        force_block_config = architecture_features.Block.from_string(args.force_block_config)
    else:
        force_block_config = None

    arch = architecture_features.ArchitectureFeatures(
        vela_config=config,
        system_config=args.system_config,
        accelerator_config=args.accelerator_config,
        permanent_storage=args.permanent_storage,
        inter_pass_cycle_delay=args.inter_pass_cycle_delay,
        dram_bandwidth=args.dram_bandwidth,
        override_block_config=force_block_config,
        block_config_limit=args.block_config_limit,
        global_memory_clock_scale=args.global_memory_clock_scale,
        max_blockdep=args.max_block_dependency,
    )

    compiler_options = compiler_driver.CompilerOptions(
        verbose_graph=args.verbose_graph,
        verbose_quantization=args.verbose_quantization,
        verbose_packing=args.verbose_packing,
        verbose_tensor_purpose=args.verbose_tensor_purpose,
        verbose_tensor_format=args.verbose_tensor_format,
        verbose_allocation=args.verbose_allocation,
        verbose_high_level_command_stream=args.verbose_high_level_command_stream,
        verbose_register_command_stream=args.verbose_register_command_stream,
        verbose_operators=args.verbose_operators,
        show_minimum_possible_allocation=args.show_minimum_possible_allocation,
        show_cpu_operations=args.show_cpu_operations,
        tensor_allocator=args.tensor_allocator,
        timing=args.timing,
        output_dir=args.output_dir,
    )

    scheduler_options = scheduler.SchedulerOptions(
        use_cascading=args.cascading,
        use_ifm_ofm_overlap=args.ifm_ofm_overlap,
        verbose_schedule=args.verbose_schedule,
        verbose_pareto_frontier_schedules=args.verbose_pareto_frontier_schedules,
        use_ifm_streaming=args.ifm_streaming,
        pareto_metric=args.pareto_metric,
    )

    model_reader_options = model_reader.ModelReaderOptions(batch_size=args.batch_size)

    # The output directory must exist before process() writes the CSV and model files into it.
    os.makedirs(args.output_dir, exist_ok=True)

    nng = process(args.network, arch, model_reader_options, compiler_options, scheduler_options)

    if args.show_subgraph_io_summary:
        print_subgraph_io_summary(nng)

    return 0
331 return 0