# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Wrapping functions for tensor address allocation, i.e. assigning addresses to tensors based on the allowable
# overlaps worked out by the live range analysis.
import math
from typing import List
import numpy as np
from . import hillclimb_allocation
from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
from .live_range import LiveRange
from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorPurpose
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
# Allocates using increasing addresses. Duplicate constant tensors will be allocated to the same address
total_sz = 0
allocated_tensors = []
# just assign increasing addresses, except for duplicates
for tens, lr in live_ranges.ranges.items():
if tens in allocated_tensors:
continue
address = total_sz
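# Tensors that share the same compressed weights reuse the address of the first such tensor that was allocated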
if tens.weight_compression_config is not None:
for allocated_tens in allocated_tensors:
if allocated_tens.weight_compression_config == tens.weight_compression_config:
assert allocated_tens.scale_compression_config == tens.scale_compression_config
address = allocated_tens.address
break
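# Equivalent LUT tensors also share a single address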
if tens.purpose == TensorPurpose.LUT:
for allocated_tens in allocated_tensors:
if allocated_tens.equivalent(tens):
address = allocated_tens.address
break
lr.set_address(address)
allocated_tensors += lr.tensors
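# Only advance the allocation point if a new address was assigned, i.e. the tensor was not a duplicate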
if address == total_sz:
total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
verify_alignment(live_ranges, alloc_granularity)
return total_sz
def hillclimb_allocate_live_ranges(
live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
) -> int:
# Allocates using the hill climb allocator
addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
# The result is a list containing the allocated addresses
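# The returned total size is the highest end address over all allocated live ranges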
total_sz = 0
for lr, address in zip(live_ranges.lrs, addresses):
total_sz = max(total_sz, address + lr.size)
lr.set_address(address)
verify_allocation(live_ranges, alloc_granularity)
return total_sz
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
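# Verifies that all tensors accessed by the CPU are aligned to the given granularity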
for lr in live_ranges.lrs:
for tens in lr.tensors:
if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
# This is a CPU tensor; verify its alignment
if tens.address % alignment != 0:
raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
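# Verifies alignment, and that no tensors with overlapping live ranges overlap in memory,
# unless they are equivalent and share the same address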
verify_alignment(live_ranges, alignment)
nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
# Contains active live ranges at each timestamp
lrs_at_time: List[List[LiveRange]] = [[] for i in range(nr_time_slots)]
for lr in live_ranges.lrs:
for t in range(lr.start_time, lr.end_time + 1):
lrs_at_time[t].append(lr)
for t in range(nr_time_slots):
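# Only live ranges that start at this time step need to be checked against the active ones;
# overlaps between earlier pairs were already checked at previous time steps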
lrs_new_items = [lr for lr in lrs_at_time[t] if t == 0 or lr not in lrs_at_time[t - 1]]
for m in lrs_new_items:
for n in lrs_at_time[t]:
overlap, tens_n, tens_m = n.overlaps_address(m)
if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
raise AllocationError(
f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
)
def mark_sram_used_for_cascaded_passes(sg, lrs):
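# Annotates each cascaded pass (and the passes it contains) with the peak SRAM usage of the tensors
# that are live while it executes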
if len(sg.cascaded_passes) < 1:
return
end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
mem_usage = np.zeros(end_pos, dtype=np.int64)
for tens, rng in lrs.ranges.items():
storage_size = tens.storage_size()
mem_usage[rng.start_time : rng.end_time] += storage_size
for cps in sg.cascaded_passes:
sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
cps.sram_used = sram_used
for ps in cps.passes:
ps.sram_used = sram_used
def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
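# Prints a verbose allocation report: one line per live range, followed by the peak tensor size,
# the peak memory usage and the allocation overhead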
print("\n" + "#" * 80)
sg_placement = (
sg.placement.name
if mem_type_set.intersection(
(
MemType.Permanent_NPU,
MemType.Permanent_CPU,
)
)
else "Cpu and Npu"
)
print(
f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
)
memory_hist = memory_usage_histogram(lrs.lrs)
min_mem_usage_for_alloc = max(memory_hist)
print(
f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
)
for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
(
lr.start_time,
lr.end_time,
lr.size,
tens.address,
tens.address + lr.size,
tens.purpose,
tens.name,
)
for tens, lr in lrs.ranges.items()
):
print(
f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
f" {memory_hist[start_time]:12d}: {purpose.display_name():12s}: {name:s}"
)
alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
print(
f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
)
print(
f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
)
print(
f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
)
def memory_usage_histogram(lrs: List[LiveRange]):
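# Calculates the total size of the live ranges that are active at each time step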
histogram = [0] * (1 + max(lr.end_time for lr in lrs))
for lr in lrs:
for t in range(lr.start_time, lr.end_time + 1):
histogram[t] += lr.size
return histogram
def allocate(
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=TensorAllocator.Greedy,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
hillclimb_max_iterations=None,
):
# Extracts live ranges for the given memory area and memory types, then allocates addresses to the tensors.
# Returns the live range graph and the total size needed for the allocation.
lrs = live_range.extract_live_ranges_from_cascaded_passes(
sg,
mem_area,
mem_type_set,
lr_graph=lr_graph,
cpu_tensor_alignment=cpu_tensor_alignment,
)
total_sz = 0
if lrs.ranges:
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.HillClimb:
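# The hill climb allocator needs the size limit of the memory; use Scratch_fast if it is in the set,
# otherwise any member of the set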
mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
mem_size = arch.mem_type_size(mem_type)
total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
else:
assert 0
return lrs, total_sz
def allocate_tensors(
nng,
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=TensorAllocator.Greedy,
verbose_allocation=False,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
hillclimb_max_iterations=None,
max_size=None,
dry_test=False,
):
# Allocates addresses to tensors, returns False if tensors could not be fit within max_size
lrs, total_sz = allocate(
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=tensor_allocator,
lr_graph=lr_graph,
cpu_tensor_alignment=cpu_tensor_alignment,
hillclimb_max_iterations=hillclimb_max_iterations,
)
if lrs.ranges:
alloc_ok = max_size is None or total_sz <= max_size
if dry_test or not alloc_ok:
# Dry test or allocation failed; undo allocation
for lr in lrs.ranges.values():
lr.set_address(None)
return alloc_ok
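# Accumulate the total allocated size for this memory area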
if sg.memory_used.get(mem_area, 0) == 0:
sg.memory_used[mem_area] = total_sz
else:
sg.memory_used[mem_area] += total_sz
# Keep track of how much should be used for scratch or permanent storage for NPU
for mem_type in mem_type_set:
if sg.memory_used_per_type.get(mem_type, 0) == 0:
sg.memory_used_per_type[mem_type] = total_sz
else:
sg.memory_used_per_type[mem_type] += total_sz
if verbose_allocation:
print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)
if mem_area == MemArea.Sram:
# Mark Sram usage for all subgraphs
for sg_ in nng.subgraphs:
mark_sram_used_for_cascaded_passes(sg_, lrs)
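# The memory usage of the root subgraph is reported as the memory usage of the whole network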
if sg == nng.get_root_subgraph():
nng.memory_used = sg.memory_used
return True