blob: 7a7aaa4173e6d628e17e865e80ece7f84b9809af [file] [log] [blame]
Raul Farkas1c54ac12023-04-26 07:49:15 +01001# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Rickard Bolinbc6ee582022-11-04 08:24:29 +000016#
Tim Hall79d07d22020-04-27 18:20:16 +010017# Description:
18# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
19# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall79d07d22020-04-27 18:20:16 +010020import math
Louis Verhaard226ecaf2021-03-30 10:18:28 +020021from typing import List
Tim Hall79d07d22020-04-27 18:20:16 +010022
Diego Russoea6111a2020-04-14 18:41:58 +010023import numpy as np
24
Louis Verhaardd7002522021-01-20 17:23:54 +010025from . import hillclimb_allocation
Diego Russoea6111a2020-04-14 18:41:58 +010026from . import live_range
27from . import numeric_util
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020028from .errors import AllocationError
Tim Hall79d07d22020-04-27 18:20:16 +010029from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Louis Verhaard226ecaf2021-03-30 10:18:28 +020030from .live_range import LiveRange
Louis Verhaard9bfe0f82020-12-03 12:26:25 +010031from .live_range import LiveRangeGraph
Diego Russoe8a10452020-04-21 17:39:10 +010032from .nn_graph import TensorAllocator
33from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020034from .tensor import MemType
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020035from .tensor import Tensor
Louis Verhaard0b8268a2020-08-05 16:11:29 +020036from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010037
38
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    """Allocate live ranges at strictly increasing addresses.

    Tensors that share a weight compression config, and equivalent LUT
    tensors, are deduplicated onto the address of the first occurrence,
    so they contribute to the total size only once.
    Returns the total allocated size in bytes.
    """
    next_free = 0
    seen_tensors = []

    for tens, lr in live_ranges.ranges.items():
        if tens in seen_tensors:
            continue

        addr = next_free
        if tens.weight_compression_config is not None:
            # Reuse the address of an identically-compressed weight tensor, if any
            dup = next(
                (a for a in seen_tensors if a.weight_compression_config == tens.weight_compression_config),
                None,
            )
            if dup is not None:
                assert dup.scale_compression_config == tens.scale_compression_config
                addr = dup.address
        if tens.purpose == TensorPurpose.LUT:
            # Equivalent LUTs also share a single allocation
            dup = next((a for a in seen_tensors if a.equivalent(tens)), None)
            if dup is not None:
                addr = dup.address

        lr.set_address(addr)
        seen_tensors += lr.tensors
        # Only a fresh (non-deduplicated) allocation consumes new space
        if addr == next_free:
            next_free += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return next_free
68
69
def hillclimb_allocate_live_ranges(
    live_ranges: LiveRangeGraph, alloc_granularity: int, max_iterations: int, mem_limit: int
) -> int:
    """Allocate addresses using the hill-climb allocator.

    The allocator returns one address per live range (in order); each is
    applied to its range, the allocation is verified, and the total
    allocated size (highest end address) is returned.
    """
    addresses = hillclimb_allocation.allocate_live_ranges(live_ranges.lrs, max_iterations, mem_limit)
    highest_end = 0
    for rng, addr in zip(live_ranges.lrs, addresses):
        rng.set_address(addr)
        highest_end = max(highest_end, addr + rng.size)
    verify_allocation(live_ranges, alloc_granularity)
    return highest_end
82
83
def verify_alignment(live_ranges: LiveRangeGraph, alignment: int):
    """Raise AllocationError if any CPU-accessed tensor is misaligned.

    A tensor counts as CPU-accessed when any of its producing or
    consuming ops is absent or not marked run_on_npu; only those
    tensors must honour the requested byte alignment.
    """
    for lr in live_ranges.lrs:
        for tens in lr.tensors:
            npu_only = all(op and op.run_on_npu for op in tens.ops + tens.consumer_list)
            if npu_only:
                continue
            # CPU tensor: its address must be a multiple of `alignment`
            if tens.address % alignment:
                raise AllocationError(f"Tensor '{tens.name}' not aligned to {alignment} bytes")
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020091
92
def verify_allocation(live_ranges: LiveRangeGraph, alignment: int):
    """Verify alignment and that no simultaneously-live buffers overlap.

    Builds a per-timestamp list of active live ranges; every range is
    checked against all ranges active at the moment it becomes live
    (checking later timestamps again would be redundant). Overlap is
    tolerated only for equivalent tensors placed at the same address.
    Raises AllocationError on any violation.
    """
    verify_alignment(live_ranges, alignment)
    nr_time_slots = 1 + max(lr.end_time for lr in live_ranges.lrs)
    # Active live ranges at each timestamp
    lrs_at_time: List[List[LiveRange]] = [[] for _ in range(nr_time_slots)]
    for lr in live_ranges.lrs:
        for t in range(lr.start_time, lr.end_time + 1):
            lrs_at_time[t].append(lr)

    prev_active: List[LiveRange] = []
    for active in lrs_at_time:
        for m in active:
            if m in prev_active:
                # Already checked at the timestamp where it became live
                continue
            for n in active:
                overlap, tens_n, tens_m = n.overlaps_address(m)
                if overlap and not (tens_n.equivalent(tens_m) and tens_n.address == tens_m.address):
                    raise AllocationError(
                        f"Overlapping buffers: {n.name}: {tens_n.address} -> {tens_n.address + n.size}"
                        f" and {m.name}: {tens_m.address} -> {tens_m.address + m.size}"
                    )
        prev_active = active
111
112
Tim Hall79d07d22020-04-27 18:20:16 +0100113def mark_sram_used_for_cascaded_passes(sg, lrs):
Tim Halld8339a72021-05-27 18:49:40 +0100114 if len(sg.cascaded_passes) < 1:
115 return
Tim Hall79d07d22020-04-27 18:20:16 +0100116 end_pos = max(ps.time for ps in sg.cascaded_passes) + 2
117 mem_usage = np.zeros(end_pos, dtype=np.int64)
118
119 for tens, rng in lrs.ranges.items():
120 storage_size = tens.storage_size()
121 mem_usage[rng.start_time : rng.end_time] += storage_size
122
123 for cps in sg.cascaded_passes:
124 sram_used = max(mem_usage[cps.time], mem_usage[cps.time + 1])
125 cps.sram_used = sram_used
126 for ps in cps.passes:
127 ps.sram_used = sram_used
128
129
def print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, actual_mem_usage_for_alloc):
    # Print a human-readable allocation report: a header identifying the memory
    # area/types and allocator, one row per allocated live range, and summary
    # statistics comparing the allocator's result against the theoretical
    # minimum (the peak of the memory-usage histogram).
    #
    # lrs: live range graph whose ranges already have addresses assigned
    # actual_mem_usage_for_alloc: total size the allocator actually used
    print("\n" + "#" * 80)
    # Permanent memory types belong to a single placement; otherwise the
    # allocation spans both Cpu and Npu subgraphs
    sg_placement = (
        sg.placement.name
        if mem_type_set.intersection(
            (
                MemType.Permanent_NPU,
                MemType.Permanent_CPU,
            )
        )
        else "Cpu and Npu"
    )
    print(
        f"Tensor Allocation for mem_area {mem_area.name}, of mem_type_set ("
        f'{", ".join(f"{mem_type.name}" for mem_type in mem_type_set)}'
        f"), using allocator {tensor_allocator}, in {sg_placement} subgraph:"
    )

    # Theoretical minimum: the largest amount of simultaneously-live tensor data
    # NOTE(review): assumes lrs.lrs is non-empty (callers only report when
    # lrs.ranges is non-empty) — max() would raise on an empty histogram
    memory_hist = memory_usage_histogram(lrs.lrs)
    min_mem_usage_for_alloc = max(memory_hist)
    print(
        f"{'Start Time':>10s} - {'End Time':>10s}: {'Start Addr':>10s} - {'End Addr':>10s}: {'Tensor Size':>11s}:"
        f" {'Memory Usage':>12s}: {'Purpose':12s}: Name"
    )
    # One row per tensor, sorted by (start time, end time, size, address, ...)
    for start_time, end_time, size, start_addr, end_addr, purpose, name in sorted(
        (
            lr.start_time,
            lr.end_time,
            lr.size,
            tens.address,
            tens.address + lr.size,
            tens.purpose,
            tens.name,
        )
        for tens, lr in lrs.ranges.items()
    ):
        print(
            f"{start_time:10d} - {end_time:10d}: {start_addr:#10x} - {end_addr:#10x}: {size:11d}:"
            f" {max(memory_hist[start_time:end_time+1]):12d}: {purpose.display_name():12s}: {name:s}"
        )

    # Overhead: how much extra the allocator used beyond the theoretical minimum
    alloc_overhead_fraction = (actual_mem_usage_for_alloc - min_mem_usage_for_alloc) / min_mem_usage_for_alloc
    print(
        f"Allocation Peak Tensor Size: {min_mem_usage_for_alloc:9d} ({min_mem_usage_for_alloc:#10x})"
        f" Bytes {min_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Peak Memory Usage: {actual_mem_usage_for_alloc:9d} ({actual_mem_usage_for_alloc:#10x})"
        f" Bytes {actual_mem_usage_for_alloc/1024.0:8.2f} KiB"
    )
    print(
        f"Allocation Overhead: {actual_mem_usage_for_alloc-min_mem_usage_for_alloc:9d}"
        f" Bytes ({100*alloc_overhead_fraction:.2f} %)"
    )
Tim Hall79d07d22020-04-27 18:20:16 +0100184
Tim Hall79d07d22020-04-27 18:20:16 +0100185
def memory_usage_histogram(lrs: List[LiveRange]):
    """Return total live-range size per timestamp, covering 0..max end time."""
    n_slots = 1 + max(lr.end_time for lr in lrs)
    return [
        sum(lr.size for lr in lrs if lr.start_time <= t <= lr.end_time)
        for t in range(n_slots)
    ]
erik.andersson@arm.com3438c922021-03-24 10:32:09 +0100193
194
def allocate(
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    hillclimb_max_iterations=None,
    verbose_progress=False,
):
    """Extract live ranges for `sg` and assign addresses to its tensors.

    Dispatches to the allocator selected by `tensor_allocator` and returns
    (lrs, total_sz): the live range graph and the total allocated size
    (0 when there is nothing to allocate).

    Raises AllocationError for an unrecognised allocator.
    """
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
        verbose_progress=verbose_progress,
    )
    total_sz = 0
    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(lrs, cpu_tensor_alignment)
            verify_allocation(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
        elif tens_alloc == TensorAllocator.HillClimb:
            # Hill climb is bounded by the size of the fastest scratch memory if present
            mem_type = MemType.Scratch_fast if MemType.Scratch_fast in mem_type_set else list(mem_type_set)[0]
            mem_size = arch.mem_type_size(mem_type)
            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment, hillclimb_max_iterations, mem_size)
        else:
            # Fail loudly on an unknown allocator. The previous `assert 0` is
            # stripped under `python -O`, which would silently return an
            # unallocated live range graph.
            raise AllocationError(f"Unsupported tensor allocator: {tens_alloc}")
    return lrs, total_sz
230
231
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    verbose_progress=False,
    lr_graph=None,
    cpu_tensor_alignment=Tensor.AllocationQuantum,
    hillclimb_max_iterations=None,
    max_size=None,
    dry_test=False,
):
    """Allocate addresses to tensors in subgraph `sg`.

    Returns False when the allocation does not fit within `max_size`
    (and undoes the addresses), True otherwise. With `dry_test` set the
    addresses are always undone and only the fit result is returned.
    On success the subgraph's memory accounting is updated and, for
    Sram, the per-cascaded-pass usage is marked on all subgraphs.
    """
    lrs, total_sz = allocate(
        sg,
        arch,
        mem_area,
        mem_type_set,
        tensor_allocator=tensor_allocator,
        lr_graph=lr_graph,
        cpu_tensor_alignment=cpu_tensor_alignment,
        hillclimb_max_iterations=hillclimb_max_iterations,
        verbose_progress=verbose_progress,
    )

    if lrs.ranges:
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry run, or over budget: roll back all assigned addresses
            for rng in lrs.ranges.values():
                rng.set_address(None)
            return alloc_ok

        # Accumulate this allocation into the subgraph's per-area total
        sg.memory_used[mem_area] = sg.memory_used.get(mem_area, 0) + total_sz

        # Track how much is used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            sg.memory_used_per_type[mem_type] = sg.memory_used_per_type.get(mem_type, 0) + total_sz

        if verbose_allocation:
            print_allocation(lrs, mem_area, mem_type_set, tensor_allocator, sg, total_sz)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for subgraph in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(subgraph, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
    return True