blob: b2ea7de6e9c287035221710efe7df51db8b9ecce [file] [log] [blame]
Louis Verhaardd7002522021-01-20 17:23:54 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
18# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall79d07d22020-04-27 18:20:16 +010019import math
Tim Hall79d07d22020-04-27 18:20:16 +010020
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
Louis Verhaardd7002522021-01-20 17:23:54 +010023from . import hillclimb_allocation
Diego Russoea6111a2020-04-14 18:41:58 +010024from . import live_range
25from . import numeric_util
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020026from .errors import AllocationError
Tim Hall79d07d22020-04-27 18:20:16 +010027from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Louis Verhaard9bfe0f82020-12-03 12:26:25 +010028from .live_range import LiveRangeGraph
Diego Russoe8a10452020-04-21 17:39:10 +010029from .nn_graph import TensorAllocator
30from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020031from .tensor import MemType
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020032from .tensor import Tensor
Louis Verhaard0b8268a2020-08-05 16:11:29 +020033from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010034
35
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    # Assigns monotonically increasing addresses to the live ranges.
    # Duplicate constant tensors (same weight compression config, or equivalent
    # LUTs) are given the address of the tensor already allocated for them.
    next_address = 0
    seen_tensors = []

    for tens, lr in live_ranges.ranges.items():
        if tens in seen_tensors:
            continue

        address = next_address
        # Reuse the address of an already-allocated tensor with identical compressed weights
        if tens.weight_compression_config is not None:
            duplicate = next(
                (t for t in seen_tensors if t.weight_compression_config == tens.weight_compression_config), None
            )
            if duplicate is not None:
                address = duplicate.address
        # LUTs with equivalent contents also share one allocation
        if tens.purpose == TensorPurpose.LUT:
            duplicate = next((t for t in seen_tensors if t.equivalent(tens)), None)
            if duplicate is not None:
                address = duplicate.address
        lr.set_address(address)
        seen_tensors.extend(lr.tensors)
        # Only advance the high-water mark when a fresh address was handed out
        if address == next_address:
            next_address += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return next_address
64
65
def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
    # Allocates using the hill climb allocator.
    # Several tensors can share one live range object, so de-duplicate first.
    unique_lrs = sorted(set(live_ranges.ranges.values()))

    # The allocator returns one address per live range, in the same order
    addresses = hillclimb_allocation.allocate_live_ranges(unique_lrs)
    highest_addr = 0
    for lr, address in zip(unique_lrs, addresses):
        lr.set_address(address)
        highest_addr = max(highest_addr, address + lr.size)
    verify_allocation(live_ranges, alloc_granularity)
    return highest_addr
80
81
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
    # Checks that every tensor touched by the CPU sits on an aligned address;
    # raises AllocationError on the first violation found.
    for lr in live_ranges.ranges.values():
        for tens in lr.tensors:
            if all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
                # NPU-only tensor; no CPU alignment requirement to check
                continue
            if tens.address % alignment != 0:
                raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020089
90
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    """Verify that the completed allocation is valid.

    Checks CPU tensor alignment once for the whole graph, then checks that no
    two overlapping live ranges were given overlapping address ranges (unless
    the tensors are equivalent duplicates sharing the same address).
    Raises AllocationError on the first violation found.
    """
    # Alignment is a whole-graph property: verify it once, not once per live
    # range as before (the repeated calls did identical work n times).
    verify_alignment(live_ranges, alignment)

    lrs = list(live_ranges.ranges.values())
    for i, n in enumerate(lrs):
        # Each unordered pair only needs to be examined once
        for m in lrs[i + 1 :]:
            if n != m and n.overlaps_ranges(m):
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
                        f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
                    )
104
105
def mark_sram_used_for_cascaded_passes(sg, lrs):
    # Annotates each cascaded pass (and the passes inside it) with the peak
    # SRAM usage over the two timesteps it occupies.
    last_time = max(cps.time for cps in sg.cascaded_passes)
    usage_per_timestep = np.zeros(last_time + 2, dtype=np.int64)

    # Accumulate every tensor's storage over the timesteps its live range spans
    for tens, rng in lrs.ranges.items():
        usage_per_timestep[rng.start_time : rng.end_time] += tens.storage_size()

    for cps in sg.cascaded_passes:
        peak = max(usage_per_timestep[cps.time], usage_per_timestep[cps.time + 1])
        cps.sram_used = peak
        for ps in cps.passes:
            ps.sram_used = peak
119
120
def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation):
    # Prints a table of the allocated live ranges; no-op unless verbose output
    # was requested.
    if not verbose_allocation:
        return

    if mem_type_set in ({MemType.Permanent_NPU}, {MemType.Permanent_CPU}):
        print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
    else:
        print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")

    # One row per tensor: (start time, start address, end address, name, end time)
    rows = [
        (
            lr.start_time,
            tens.address,
            tens.address + int(math.ceil(tens.storage_size())),
            tens.name + " " + str(tens.purpose),
            lr.end_time,
        )
        for tens, lr in lrs.ranges.items()
    ]
    mem_usage = 0
    for start_time, start, end, name, end_time in sorted(rows):
        name = name.replace("\x00", "")
        print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
        mem_usage = max(mem_usage, end)
    print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
    print()
143
Tim Hall79d07d22020-04-27 18:20:16 +0100144
def calculate_allocation_efficiency(lrs):
    # Returns the theoretical minimum peak memory: the largest sum of live
    # range sizes that are simultaneously alive at any single timestep.
    unique_lrs = set(lrs.ranges.values())
    n_timesteps = 1 + max(lr.end_time for lr in unique_lrs)

    usage_at_time = [0] * n_timesteps
    for lr in unique_lrs:
        size = lr.size
        for t in range(lr.start_time, lr.end_time + 1):
            usage_at_time[t] += size

    return max(usage_at_time)
154
155
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    max_size=None,
    dry_test=False,
):
    """Allocate addresses to the tensors of subgraph sg in mem_area.

    Extracts live ranges for the given memory area/types, runs the selected
    allocator (Greedy, LinearAlloc or HillClimb), and records memory usage on
    sg (and on nng when sg is the root subgraph).

    Returns False if the tensors could not fit within max_size; otherwise True.
    When dry_test is set, the allocation is computed and then undone, so only
    the return value is meaningful.
    """
    ignore_subgraph_input_output_tensors = False
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
    )

    # Nothing to allocate when there are no live ranges in this memory area
    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment, verbose_allocation)
            # The greedy allocator does not verify its own result; the linear
            # and hill-climb allocators verify internally
            verify_allocation(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.HillClimb:
            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
        else:
            assert 0
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry test or allocation failed; undo allocation
            for lr in lrs.ranges.values():
                lr.set_address(None)
            return alloc_ok

        # Accumulate this allocation into the subgraph's per-area usage
        if sg.memory_used.get(mem_area, 0) == 0:
            sg.memory_used[mem_area] = total_sz
        else:
            sg.memory_used[mem_area] += total_sz

        # Keep track of how much should be used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            if sg.memory_used_per_type.get(mem_type, 0) == 0:
                sg.memory_used_per_type[mem_type] = total_sz
            else:
                sg.memory_used_per_type[mem_type] += total_sz

        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation)

        if mem_area == MemArea.Sram:
            sg.min_mem_usage = calculate_allocation_efficiency(lrs)
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    # Root subgraph: publish totals and the weight compression ratio on the graph
    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
        try:
            nng.weights_compression_ratio = nng.total_compressed_weights / nng.total_original_weights
        except ZeroDivisionError:
            # No original weights at all; report zero rather than dividing by zero
            nng.weights_compression_ratio = 0.0

    return True