Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been |
| 18 | # worked out from the allowable overlaps that are calculated by the live range analysis. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 19 | import math |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 20 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 21 | import numpy as np |
| 22 | |
| 23 | from . import live_range |
| 24 | from . import numeric_util |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 25 | from .errors import AllocationError |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 26 | from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 27 | from .nn_graph import TensorAllocator |
| 28 | from .tensor import MemArea |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 29 | from .tensor import MemType |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 30 | from .tensor import Tensor |
Louis Verhaard | 0b8268a | 2020-08-05 16:11:29 +0200 | [diff] [blame] | 31 | from .tensor import TensorPurpose |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 32 | |
| 33 | |
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    """Allocate live ranges at strictly increasing addresses.

    Duplicate constant tensors (same weight compression config) and equivalent
    LUT tensors are given the address of the previously allocated copy, so the
    data is stored only once.

    Side effect: every live range in live_ranges gets its address assigned.
    Returns the total allocated size in bytes (rounded up per range to
    alloc_granularity). Raises AllocationError (via verify_alignment) if a
    CPU-visible tensor ends up misaligned.
    """
    total_sz = 0
    allocated_tensors = []  # kept in allocation order for the duplicate-match scans below
    allocated_set = set()  # same tensors; gives O(1) membership instead of an O(n) list scan

    # just assign increasing addresses, except for duplicates
    for tens, lr in live_ranges.ranges.items():
        if tens in allocated_set:
            continue

        address = total_sz
        # Constant weights: re-use the address of an identical compressed weight tensor
        if tens.weight_compression_config is not None:
            for allocated_tens in allocated_tensors:
                if allocated_tens.weight_compression_config == tens.weight_compression_config:
                    address = allocated_tens.address
                    break
        # LUTs: re-use the address of an equivalent already-allocated LUT
        if tens.purpose == TensorPurpose.LUT:
            for allocated_tens in allocated_tensors:
                if allocated_tens.equivalent(tens):
                    address = allocated_tens.address
                    break
        lr.set_address(address)
        allocated_tensors += lr.tensors
        allocated_set.update(lr.tensors)
        # Only grow the arena when a fresh address was handed out (not a duplicate re-use)
        if address == total_sz:
            total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return total_sz
| 62 | |
| 63 | |
def verify_alignment(live_ranges, alignment):
    """Verify that every CPU-visible tensor is placed at a multiple of `alignment` bytes.

    Tensors whose producers and consumers all run on the NPU are exempt.
    Raises AllocationError on the first misaligned CPU tensor found.
    """
    for live_range in live_ranges.ranges.values():
        for tensor in live_range.tensors:
            npu_only = all(op and op.run_on_npu for op in tensor.ops + tensor.consumer_list)
            if npu_only:
                continue
            # Tensor is produced or consumed on the CPU side, so alignment matters
            if tensor.address % alignment != 0:
                raise AllocationError("Tensor {} not aligned to {} bytes".format(tensor.name, alignment))
| 71 | |
| 72 | |
def mark_sram_used_for_cascaded_passes(sg, lrs):
    """Annotate each cascaded pass of subgraph `sg` (and its inner passes) with SRAM usage.

    Builds a per-time-slot usage histogram from the live ranges in `lrs`, then
    records for each cascaded pass the peak usage over its two time slots.
    """
    n_slots = max(cp.time for cp in sg.cascaded_passes) + 2
    usage = np.zeros(n_slots, dtype=np.int64)

    # Accumulate each tensor's storage over the time interval it is live
    for tensor, rng in lrs.ranges.items():
        usage[rng.start_time : rng.end_time] += tensor.storage_size()

    for cp in sg.cascaded_passes:
        peak = max(usage[cp.time], usage[cp.time + 1])
        cp.sram_used = peak
        for inner_pass in cp.passes:
            inner_pass.sram_used = peak
| 86 | |
| 87 | |
def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
    """Print the allocation report for one memory area.

    With verbose_allocation, dumps every live range's address span sorted by
    start time, followed by total memory usage. With
    show_minimum_possible_allocation (SRAM only), also prints the lower bound
    implied by the cascaded passes. Output only; nothing is returned.
    """
    if verbose_allocation:
        if mem_type_set in ({MemType.Permanent_NPU}, {MemType.Permanent_CPU}):
            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
        else:
            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")

        entries = [
            (
                lr.start_time,
                tens.address,
                tens.address + int(math.ceil(tens.storage_size())),
                tens.name + " " + str(tens.purpose),
                lr.end_time,
            )
            for tens, lr in lrs.ranges.items()
        ]
        mem_usage = 0
        for start_time, start, end, name, end_time in sorted(entries):
            name = name.replace("\x00", "")
            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
            mem_usage = max(mem_usage, end)
        print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
        print()

    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
        print(
            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
        )
| 117 | |
| 118 | |
| 119 | def allocate_tensors( |
| 120 | nng, |
| 121 | sg, |
| 122 | arch, |
| 123 | mem_area, |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 124 | mem_type_set, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 125 | tensor_allocator=TensorAllocator.Greedy, |
| 126 | verbose_allocation=False, |
| 127 | show_minimum_possible_allocation=False, |
| 128 | lr_graph=None, |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 129 | allocation_alignment=Tensor.AllocationQuantum, |
Louis Verhaard | 0b9c9a3 | 2020-09-15 14:05:38 +0200 | [diff] [blame] | 130 | max_size=None, |
| 131 | dry_test=False, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 132 | ): |
Louis Verhaard | 0b9c9a3 | 2020-09-15 14:05:38 +0200 | [diff] [blame] | 133 | # Allocates addresses to tensors, returns False if tensors could not be fit within max_size |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 134 | ignore_subgraph_input_output_tensors = False |
| 135 | lrs = live_range.extract_live_ranges_from_cascaded_passes( |
| 136 | sg, |
| 137 | mem_area, |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 138 | mem_type_set, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 139 | ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors, |
| 140 | lr_graph=lr_graph, |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 141 | allocation_alignment=allocation_alignment, |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 142 | ) |
| 143 | |
| 144 | if lrs.ranges: |
| 145 | tens_alloc = tensor_allocator |
| 146 | if tens_alloc == TensorAllocator.Greedy: |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 147 | total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, allocation_alignment, verbose_allocation) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 148 | elif tens_alloc == TensorAllocator.LinearAlloc: |
Jacob Bohlin | 0628a8c | 2020-08-28 13:25:14 +0200 | [diff] [blame] | 149 | total_sz = linear_allocate_live_ranges(lrs, allocation_alignment) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 150 | else: |
| 151 | assert 0 |
Louis Verhaard | 0b9c9a3 | 2020-09-15 14:05:38 +0200 | [diff] [blame] | 152 | alloc_ok = max_size is None or total_sz <= max_size |
| 153 | if dry_test or not alloc_ok: |
| 154 | # Dry test or allocation failed; undo allocation |
| 155 | for lr in lrs.ranges.values(): |
| 156 | lr.set_address(None) |
| 157 | return alloc_ok |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 158 | |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 159 | if sg.memory_used.get(mem_area, 0) == 0: |
| 160 | sg.memory_used[mem_area] = total_sz |
| 161 | else: |
| 162 | sg.memory_used[mem_area] += total_sz |
| 163 | |
| 164 | # Keep track of how much should be used for scratch or permanent storage for NPU |
| 165 | for mem_type in mem_type_set: |
| 166 | if sg.memory_used_per_type.get(mem_type, 0) == 0: |
| 167 | sg.memory_used_per_type[mem_type] = total_sz |
| 168 | else: |
| 169 | sg.memory_used_per_type[mem_type] += total_sz |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 170 | |
| 171 | nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges) |
| 172 | nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges) |
| 173 | |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 174 | print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 175 | |
| 176 | if mem_area == MemArea.Sram: |
| 177 | # Mark Sram usage for all subgraphs |
| 178 | for sg_ in nng.subgraphs: |
| 179 | mark_sram_used_for_cascaded_passes(sg_, lrs) |
| 180 | |
| 181 | if sg == nng.get_root_subgraph(): |
| 182 | nng.memory_used = sg.memory_used |
| 183 | for mem_area in nng.total_elements.keys(): |
| 184 | try: |
| 185 | nng.bits_per_element[mem_area] = nng.total_size[mem_area] * 8 / nng.total_elements[mem_area] |
| 186 | except ZeroDivisionError: |
| 187 | nng.bits_per_element[mem_area] = 0.0 |
Louis Verhaard | 0b9c9a3 | 2020-09-15 14:05:38 +0200 | [diff] [blame] | 188 | return True |