# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall79d07d22020-04-27 18:20:16 +010019import math
Louis Verhaard226ecaf2021-03-30 10:18:28 +020020from typing import List
Tim Hall79d07d22020-04-27 18:20:16 +010021
Diego Russoea6111a2020-04-14 18:41:58 +010022import numpy as np
23
Louis Verhaardd7002522021-01-20 17:23:54 +010024from . import hillclimb_allocation
Diego Russoea6111a2020-04-14 18:41:58 +010025from . import live_range
26from . import numeric_util
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020027from .errors import AllocationError
Tim Hall79d07d22020-04-27 18:20:16 +010028from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Louis Verhaard226ecaf2021-03-30 10:18:28 +020029from .live_range import LiveRange
Louis Verhaard9bfe0f82020-12-03 12:26:25 +010030from .live_range import LiveRangeGraph
Diego Russoe8a10452020-04-21 17:39:10 +010031from .nn_graph import TensorAllocator
32from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020033from .tensor import MemType
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020034from .tensor import Tensor
Louis Verhaard0b8268a2020-08-05 16:11:29 +020035from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010036
37
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    """Assign monotonically increasing addresses to all live ranges.

    Duplicate weight tensors (identical compression config) and equivalent
    LUT tensors reuse the address of their first occurrence rather than
    getting fresh space. Returns the total allocated size in bytes.
    """
    total_sz = 0
    seen_tensors = []

    # Just assign increasing addresses, except for duplicates.
    for tens, lr in live_ranges.ranges.items():
        if tens in seen_tensors:
            continue

        address = total_sz
        if tens.weight_compression_config is not None:
            # Share the address of an identically compressed weight tensor.
            for candidate in seen_tensors:
                if candidate.weight_compression_config == tens.weight_compression_config:
                    assert candidate.scale_compression_config == tens.scale_compression_config
                    address = candidate.address
                    break
        if tens.purpose == TensorPurpose.LUT:
            # Equivalent LUTs share a single allocation.
            for candidate in seen_tensors:
                if candidate.equivalent(tens):
                    address = candidate.address
                    break
        lr.set_address(address)
        seen_tensors += lr.tensors
        if address == total_sz:
            # Only a fresh (non-shared) allocation advances the high-water mark.
            total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return total_sz
67
68
def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
    """Allocate the live ranges with the hill climb allocator.

    Returns the total memory needed, i.e. the highest end address over all
    allocated live ranges.
    """
    # The allocator returns one address per live range, in the same order as the input.
    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs)
    total_sz = 0
    for lr, address in zip(live_ranges.lrs, addresses):
        lr.set_address(address)
        total_sz = max(total_sz, address + lr.size)
    verify_allocation(live_ranges, alloc_granularity)
    return total_sz
79
80
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
    """Check that every CPU-visible tensor address is a multiple of `alignment`.

    Tensors whose producers and consumers all run on the NPU are exempt;
    raises AllocationError on the first misaligned CPU tensor.
    """
    for lr in live_ranges.lrs:
        for tens in lr.tensors:
            if all(op and op.run_on_npu for op in tens.ops + tens.consumer_list):
                # NPU-only tensor: no CPU alignment requirement.
                continue
            if tens.address % alignment != 0:
                raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020088
89
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    """Sanity-check a finished allocation.

    Verifies CPU tensor alignment, then checks that no two live ranges that
    are alive at the same time occupy overlapping addresses. Overlap is only
    tolerated for equivalent tensors deliberately placed at the same address.
    """
    verify_alignment(live_ranges, alignment)
    nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
    # Bucket the live ranges by every timestamp at which they are alive.
    lrs_at_time = [[] for _ in range(nr_time_slots)]
    for lr in live_ranges.lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            lrs_at_time[t].append(lr)
    for t in range(nr_time_slots):
        alive = lrs_at_time[t]
        # Pairwise comparison of all live ranges active in this time slot.
        for ix, n in enumerate(alive):
            for m in alive[ix + 1 :]:
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
                        f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
                    )
107
108
Tim Hall79d07d22020-04-27 18:20:16 +0100109def mark_sram_used_for_cascaded_passes(sg, lrs):
Tim Halld8339a72021-05-27 18:49:40 +0100110 if len(sg.cascaded_passes) < 1:
111 return
Tim Hall79d07d22020-04-27 18:20:16 +0100112 end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
113 mem_usage = np.zeros(end_pos, dtype=np.int64)
114
115 for tens, rng in lrs.ranges.items():
116 storage_size = tens.storage_size()
117 mem_usage[rng.start_time : rng.end_time] += storage_size
118
119 for cps in sg.cascaded_passes:
120 sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
121 cps.sram_used = sram_used
122 for ps in cps.passes:
123 ps.sram_used = sram_used
124
125
def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
    """Print a human-readable report of the tensor allocation.

    Lists every live range (sorted by time, then address) together with the
    memory usage at its start time, then summarises the allocator's overhead
    relative to the theoretical minimum (the peak of the usage histogram).
    """
    print("\n" + "#" * 80)
    if mem_type_set.intersection((MemType.Permanent_NPU, MemType.Permanent_CPU,)):
        sg_placement = sg.placement.name
    else:
        sg_placement = "Cpu and Npu"
    print(
        f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
        f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
        f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
    )

    memory_hist = memory_usage_histogram(lrs.lrs)
    min_mem_usage_for_alloc = max(memory_hist)
    print("Start Time - End Time: Start Addr - End Addr: Tensor Size: Memory Usage: Tensor Purpose: Tensor Name")
    rows = sorted(
        (lr.start_time, lr.end_time, lr.size, tens.address, tens.address + lr.size, tens.purpose, tens.name,)
        for tens, lr in lrs.ranges.items()
    )
    for start_time, end_time, size, start_addr, end_addr, purpose, name in rows:
        print(
            f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
            f" {memory_hist[start_time]:12d}: {purpose.display_name():15s}: {name:s}"
        )

    # Overhead is measured against the peak of the histogram, the best any allocator could do.
    alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
    print(
        f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
        f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
        f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
        f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100164
Tim Hall79d07d22020-04-27 18:20:16 +0100165
def memory_usage_histogram(lrs: List[LiveRange]):
    """Return the total live tensor size per timestamp.

    Index t of the result holds the summed size of all live ranges alive at
    time t; the list covers timestamps 0 .. max end time inclusive.
    """
    last_time = max(lr.end_time for lr in lrs)
    histogram = [0] * (last_time + 1)
    for lr in lrs:
        for slot in range(lr.start_time, lr.end_time + 1):
            histogram[slot] += lr.size

    return histogram
erik.andersson@arm.com3438c922021-03-24 10:32:09 +0100173
174
def allocate(
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
):
    """Extract live ranges for the subgraph and allocate addresses to them.

    Returns a tuple (live range graph, total allocated size in bytes); the
    total is 0 when there is nothing to allocate.

    Raises AllocationError for an unsupported allocator, or (via the
    verification step) when the resulting allocation is invalid.
    """
    ignore_subgraph_input_output_tensors = False
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
    )
    total_sz = 0
    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, cpu_tensor_alignment)
            verify_allocation(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.HillClimb:
            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
        else:
            # Was `assert 0`, which is stripped under `python -O` and would then
            # silently return an unallocated graph; fail loudly instead.
            raise AllocationError(f"Unsupported tensor allocator: {tens_alloc}")
    return lrs, total_sz
207
208
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    max_size=None,
    dry_test=False,
):
    """Allocate addresses to tensors; returns False if they do not fit in max_size.

    On a dry test, or when the allocation exceeds max_size, every address is
    reset to None and the subgraph bookkeeping is left untouched.
    """
    lrs, total_sz = allocate(
        sg,
        arch,
        mem_area,
        mem_type_set,
        tensor_allocator=tensor_allocator,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
    )

    if lrs.ranges:
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry test or allocation failed; undo allocation
            for lr in lrs.ranges.values():
                lr.set_address(None)
            return alloc_ok

        sg.memory_used[mem_area] = sg.memory_used.get(mem_area, 0) + total_sz

        # Keep track of how much should be used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            sg.memory_used_per_type[mem_type] = sg.memory_used_per_type.get(mem_type, 0) + total_sz

        if verbose_allocation:
            print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
    return True
Louis Verhaard0b9c9a32020-09-15 14:05:38 +0200263 return True