# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Wrapping functions for tensor address allocation, i.e. assigning addresses to tensors based on the allowable
# overlaps worked out by the live range analysis.
import math
from typing import List
import numpy as np
from . import hillclimb_allocation
from . import live_range
from . import numeric_util
from .errors import AllocationError
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
from .live_range import LiveRange
from .live_range import LiveRangeGraph
from .nn_graph import TensorAllocator
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorPurpose
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
# Allocates using increasing addresses. Duplicate constant tensors will be allocated to the same address
total_sz = 0
allocated_tensors = []
# just assign increasing addresses, except for duplicates
for tens, lr in live_ranges.ranges.items():
if tens in allocated_tensors:
continue
address = total_sz
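# Tensors that share the same compressed weights reuse the address of the first such tensor that was allocated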
if tens.weight_compression_config is not None:
for allocated_tens in allocated_tensors:
if allocated_tens.weight_compression_config == tens.weight_compression_config:
assert allocated_tens.scale_compression_config == tens.scale_compression_config
address = allocated_tens.address
break
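# Equivalent LUT tensors also share a single address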
if tens.purpose == TensorPurpose.LUT:
for allocated_tens in allocated_tensors:
if allocated_tens.equivalent(tens):
address = allocated_tens.address
break
lr.set_address(address)
allocated_tensors += lr.tensors
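# Only advance the allocation point if a new address was assigned, i.e. the tensor was not a duplicate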
if address == total_sz:
total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)
verify_alignment(live_ranges, alloc_granularity)
return total_sz
def hillclimb_allocate_live_ranges(
live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
) -> int:
# Allocates using the hill climb allocator
addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
# The result is a list containing the allocated addresses
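# The returned total size is the highest end address over all allocated live ranges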
total_sz = 0
for lr, address in zip(live_ranges.lrs, addresses):
total_sz = max(total_sz, address + lr.size)
lr.set_address(address)
verify_allocation(live_ranges, alloc_granularity)
return total_sz
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
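# Verifies that all tensors accessed by the CPU are aligned to the given granularity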
for lr in live_ranges.lrs:
for tens in lr.tensors:
if not all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
# This is a CPU tensor; verify its alignment
if tens.address % alignment != 0:
raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
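# Verifies alignment, and that no tensors with overlapping live ranges overlap in memory,
# unless they are equivalent and share the same address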
verify_alignment(live_ranges, alignment)
nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
# Contains active live ranges at each timestamp
lrs_at_time: List[List[LiveRange]] = [[] for i in range(nr_time_slots)]
for lr in live_ranges.lrs:
for t in range(lr.start_time, lr.end_time + 1):
lrs_at_time[t].append(lr)
for t in range(nr_time_slots):
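# Only live ranges that start at this time step need to be checked against the active ones;
# overlaps between earlier pairs were already checked at previous time steps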
lrs_new_items = [lr for lr in lrs_at_time[t] if t == 0 or lr not in lrs_at_time[t - 1]]
for m in lrs_new_items:
for n in lrs_at_time[t]:
overlap, tens_n, tens_m = n.overlaps_address(m)
if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
raise AllocationError(
f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
)
def mark_sram_used_for_cascaded_passes(sg, lrs):
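# Annotates each cascaded pass (and the passes it contains) with the peak SRAM usage of the tensors
# that are live while it executes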
if len(sg.cascaded_passes) < 1:
return
end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
mem_usage = np.zeros(end_pos, dtype=np.int64)
for tens, rng in lrs.ranges.items():
storage_size = tens.storage_size()
mem_usage[rng.start_time : rng.end_time] += storage_size
for cps in sg.cascaded_passes:
sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
cps.sram_used = sram_used
for ps in cps.passes:
ps.sram_used = sram_used
def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
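# Prints a verbose allocation report: one line per live range, followed by the peak tensor size,
# the peak memory usage and the allocation overhead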
print("\n" + "#" * 80)
sg_placement = (
sg.placement.name
if mem_type_set.intersection(
(
MemType.Permanent_NPU,
MemType.Permanent_CPU,
)
)
else "Cpu and Npu"
)
print(
f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
)
memory_hist = memory_usage_histogram(lrs.lrs)
min_mem_usage_for_alloc = max(memory_hist)
print(
f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
)
for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
(
lr.start_time,
lr.end_time,
lr.size,
tens.address,
tens.address + lr.size,
tens.purpose,
tens.name,
)
for tens, lr in lrs.ranges.items()
):
print(
f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
f" {memory_hist[start_time]:12d}: {purpose.display_name():12s}: {name:s}"
)
alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
print(
f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
)
print(
f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
)
print(
f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
)
def memory_usage_histogram(lrs: List[LiveRange]):
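# Calculates the total size of the live ranges that are active at each time step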
histogram = [0] * (1 + max(lr.end_time for lr in lrs))
for lr in lrs:
for t in range(lr.start_time, lr.end_time + 1):
histogram[t] += lr.size
return histogram
def allocate(
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=TensorAllocator.Greedy,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
hillclimb_max_iterations=None,
):
# Extracts live ranges for the given memory area and memory types, then allocates addresses to the tensors.
# Returns the live range graph and the total size needed for the allocation.
lrs = live_range.extract_live_ranges_from_cascaded_passes(
sg,
mem_area,
mem_type_set,
lr_graph=lr_graph,
cpu_tensor_alignment=cpu_tensor_alignment,
)
total_sz = 0
if lrs.ranges:
tens_alloc = tensor_allocator
if tens_alloc == TensorAllocator.Greedy:
total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
verify_allocation(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.LinearAlloc:
total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
elif tens_alloc == TensorAllocator.HillClimb:
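# The hill climb allocator needs the size limit of the memory; use Scratch_fast if it is in the set,
# otherwise any member of the set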
mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
mem_size = arch.mem_type_size(mem_type)
total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
else:
assert 0
return lrs, total_sz
def allocate_tensors(
nng,
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=TensorAllocator.Greedy,
verbose_allocation=False,
lr_graph=None,
cpu_tensor_alignment=Tensor.AllocationQuantum,
hillclimb_max_iterations=None,
max_size=None,
dry_test=False,
):
# Allocates addresses to tensors, returns False if tensors could not be fit within max_size
lrs, total_sz = allocate(
sg,
arch,
mem_area,
mem_type_set,
tensor_allocator=tensor_allocator,
lr_graph=lr_graph,
cpu_tensor_alignment=cpu_tensor_alignment,
hillclimb_max_iterations=hillclimb_max_iterations,
)
if lrs.ranges:
alloc_ok = max_size is None or total_sz <= max_size
if dry_test or not alloc_ok:
# Dry test or allocation failed; undo allocation
for lr in lrs.ranges.values():
lr.set_address(None)
return alloc_ok
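# Accumulate the total allocated size for this memory area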
if sg.memory_used.get(mem_area, 0) == 0:
sg.memory_used[mem_area] = total_sz
else:
sg.memory_used[mem_area] += total_sz
# Keep track of how much should be used for scratch or permanent storage for NPU
for mem_type in mem_type_set:
if sg.memory_used_per_type.get(mem_type, 0) == 0:
sg.memory_used_per_type[mem_type] = total_sz
else:
sg.memory_used_per_type[mem_type] += total_sz
if verbose_allocation:
print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)
if mem_area == MemArea.Sram:
# Mark Sram usage for all subgraphs
for sg_ in nng.subgraphs:
mark_sram_used_for_cascaded_passes(sg_, lrs)
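# The memory usage of the root subgraph is reported as the memory usage of the whole network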
if sg == nng.get_root_subgraph():
nng.memory_used = sg.memory_used
return True