# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time
from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import register_command_stream_generator
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .debug_database import DebugDatabase
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .tensor import MemType
from .tensor import Tensor


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

    Note the difference between ArchitectureFeatures and CompilerOptions
    - ArchitectureFeatures is for changing the Ethos-U55 and system architecture
    - CompilerOptions is for changing the behaviour of the compiler
    """
def __init__(
self,
verbose_graph=False,
verbose_quantization=False,
verbose_packing=False,
verbose_tensor_purpose=False,
verbose_tensor_format=False,
verbose_allocation=False,
verbose_high_level_command_stream=False,
verbose_register_command_stream=False,
verbose_operators=False,
show_minimum_possible_allocation=False,
show_cpu_operations=False,
tensor_allocator=TensorAllocator.Greedy,
timing=False,
output_dir="outputs",
allocation_alignment=Tensor.AllocationQuantum,
):
self.verbose_graph = verbose_graph
self.verbose_quantization = verbose_quantization
self.verbose_packing = verbose_packing
self.verbose_tensor_purpose = verbose_tensor_purpose
self.verbose_tensor_format = verbose_tensor_format
self.verbose_allocation = verbose_allocation
self.verbose_high_level_command_stream = verbose_high_level_command_stream
self.verbose_register_command_stream = verbose_register_command_stream
self.verbose_operators = verbose_operators
self.show_minimum_possible_allocation = show_minimum_possible_allocation
self.show_cpu_operations = show_cpu_operations
self.tensor_allocator = tensor_allocator
self.timing = timing
self.output_dir = output_dir
self.allocation_alignment = allocation_alignment
def __str__(self):
return type(self).__name__ + ": " + str(self.__dict__)
__repr__ = __str__


def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that the tensor allocator can successfully fit.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1 (next SRAM factor to try),
    # and dry_test is True while still bisecting.
upper = 1.0
lower = 0.7
MAX_ITERATIONS = 8
if len(alloc_results) == 0:
# First iteration, try max SRAM, keep the result if it succeeds
return (upper, False)
elif len(alloc_results) == 1:
if alloc_results[0]:
# The allocator succeeded at first try; stop
return (None, False)
else:
            # Start bisecting, try the lower bound SRAM
return (lower, True)
elif len(alloc_results) > MAX_ITERATIONS:
# Stop
return (None, False)
if not alloc_results[1]:
# Allocation at lower failed; search interval 0 - lower
upper = lower
lower = 0
best = lower
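    # Replay the recorded allocation results to reconstruct the current bisection interval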
for success in alloc_results[2:]:
middle = (lower + upper) / 2
if success:
best = max(best, middle)
lower = middle
else:
upper = middle
if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as a dry test
return (best, False)
# Next try; run only as dry test
return ((lower + upper) / 2, True)


def _record_operator(op, arch):
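    # Track the original (pre-optimisation) operator in the debug database; constant ops are not recorded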
if op.type != Op.Const:
DebugDatabase.add_source(op)


def compiler_driver(nng, arch, options, scheduler_options):
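    """Run the compilation passes in sequence: graph optimisation, pass packing, Npu subgraph
    extraction, scheduling, tensor allocation and command stream generation/serialisation."""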
assert verify_graph_health(nng)
# Pre-optimisation operator tracking
for sg in nng.subgraphs:
visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])
nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
assert verify_graph_health(nng)
if options.verbose_quantization:
nng.print_graph_with_tensor_quantization()
nng = graph_optimiser.optimise_graph_b(nng, arch, options.verbose_graph)
assert verify_graph_health(nng)
nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
assert verify_graph_health(nng)
nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
assert verify_graph_health(nng)
pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
assert verify_graph_health(nng)
extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)
assert verify_graph_health(nng)
if options.timing:
start = time.time()
# Run the scheduler
scheduler.schedule_passes(nng, arch, scheduler_options)
if options.timing:
stop = time.time()
print("Scheduling took %f s" % (stop - start))
start = time.time()
# Update the compressed weights now that we have determined the
# block config, and calc and pack the scales and biases
weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)
# LiveRanges for constant tensors for all Npu subgraphs
permanent_storage = arch.permanent_storage_mem_area
lr_graph_flash = live_range.LiveRangeGraph()
# Placeholders for scratch and flash tensors that are common for all Npu subgraphs
scratch_tens = None
scratch_fast_tens = None
flash_tens = None
# Calculate live ranges for all constant Npu tensors, in permanent storage
for sg in nng.subgraphs:
if sg.placement == PassPlacement.Npu:
lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
sg,
permanent_storage,
MemType.Permanent_NPU,
ignore_subgraph_input_output_tensors=True,
lr_graph=lr_graph_flash,
)
if len(nng.subgraphs) > 1:
# Allocate all Npu constant tensors to the first Npu subgraph since it is
# processed first during serialization into tensors
first_npu_sg = nng.subgraphs[1]
assert first_npu_sg.placement == PassPlacement.Npu
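        # Constant Npu tensors are placed in permanent storage using the linear allocator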
tensor_allocation.allocate_tensors(
nng,
first_npu_sg,
arch,
permanent_storage,
set((MemType.Permanent_NPU,)),
tensor_allocator=TensorAllocator.LinearAlloc,
verbose_allocation=options.verbose_allocation,
show_minimum_possible_allocation=options.show_minimum_possible_allocation,
lr_graph=lr_graph_flash,
)
# Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
# will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
# Npu subgraph and add them to the root's live range graph.
# The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
# arch.fast_storage_mem_area.
# When these memory areas are the same, all non-constant tensors are allocated together.
# Otherwise they are allocated separately.
root_sg = nng.get_root_subgraph()
alloc_list = []
feature_maps_in_fast_storage = arch.feature_map_storage_mem_area == arch.fast_storage_mem_area
if feature_maps_in_fast_storage:
mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
alloc_list.append(mem_alloc_scratch)
else:
mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
# Order is important
alloc_list.append(mem_alloc_scratch_fast)
alloc_list.append(mem_alloc_scratch)
for mem_area, mem_type_set in alloc_list:
if feature_maps_in_fast_storage or mem_area != arch.fast_storage_mem_area:
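            # Straightforward allocation: either feature maps share the fast storage area, or this
            # memory area is not the size-limited fast storage, so no SRAM bisection is needed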
tensor_allocation.allocate_tensors(
nng,
root_sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
show_minimum_possible_allocation=options.show_minimum_possible_allocation,
allocation_alignment=options.allocation_alignment,
)
else:
# For the case where scratch_fast != scratch: attempt to place feature maps used between
# cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
alloc_results = []
while True:
assert len(alloc_results) < 10, "Infinite allocator loop"
sram_factor, dry_test = next_sram_factor(alloc_results)
if sram_factor is None:
break
# Try to move as many feature maps as possible to SRAM before allocating
sram_limit = sram_factor * arch.sram_size
for sg in nng.subgraphs:
scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
alloc_success = tensor_allocation.allocate_tensors(
nng,
root_sg,
arch,
mem_area,
mem_type_set,
max_size=arch.sram_size,
dry_test=dry_test,
tensor_allocator=options.tensor_allocator,
verbose_allocation=options.verbose_allocation,
show_minimum_possible_allocation=options.show_minimum_possible_allocation,
allocation_alignment=options.allocation_alignment,
)
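                # Dry runs and failed attempts must not leave feature maps in fast storage;
                # the final non-dry iteration re-applies the best factor found by the bisection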
if dry_test or not alloc_success:
for sg in nng.subgraphs:
scheduler.undo_use_fast_storage(sg, arch)
alloc_results.append(alloc_success)
if not alloc_results[-1]:
raise VelaError(
"Sram limit {} bytes, has been exceeded by the scratch fast tensor. "
"Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
"See OPTIONS.md for more information.".format(arch.sram_size)
)
# Generate command streams and serialise Npu-ops into tensors
for sg in nng.subgraphs:
high_level_command_stream_generator.generate_high_level_command_stream(
nng, sg, arch, options.verbose_high_level_command_stream
)
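        # Optimise LUT handling in the high level command stream (e.g. avoid redundant LUT DMA transfers)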
lut.optimize_high_level_cmd_stream(sg, arch)
register_command_stream_generator.generate_register_command_stream_for_sg(
nng, sg, arch, options.verbose_register_command_stream
)
scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
)
npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)
    # Set the Scratch and Scratch_fast tensor sizes
if scratch_tens is not None:
scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
if scratch_fast_tens is not None:
scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])
    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
tensor_allocation.allocate_tensors(
nng,
root_sg,
arch,
permanent_storage,
set((MemType.Permanent_CPU,)),
tensor_allocator=TensorAllocator.LinearAlloc,
verbose_allocation=options.verbose_allocation,
show_minimum_possible_allocation=options.show_minimum_possible_allocation,
allocation_alignment=options.allocation_alignment,
)
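    # Calculate performance estimates (cycle counts and memory bandwidth) for the scheduled network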
npu_performance.calc_performance_for_network(nng, arch)