blob: 1efcd68694971ead4b5b78cc998631aec7a94e34 [file] [log] [blame]
Tim Hall79d07d22020-04-27 18:20:16 +01001# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
17# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
18# worked out from the allowable overlaps that are calculated by the live range analysis.
Tim Hall79d07d22020-04-27 18:20:16 +010019import math
Tim Hall79d07d22020-04-27 18:20:16 +010020
Diego Russoea6111a2020-04-14 18:41:58 +010021import numpy as np
22
23from . import live_range
24from . import numeric_util
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020025from .errors import AllocationError
Tim Hall79d07d22020-04-27 18:20:16 +010026from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
Diego Russoe8a10452020-04-21 17:39:10 +010027from .nn_graph import TensorAllocator
28from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020029from .tensor import MemType
Jacob Bohlin0628a8c2020-08-28 13:25:14 +020030from .tensor import Tensor
Louis Verhaard0b8268a2020-08-05 16:11:29 +020031from .tensor import TensorPurpose
Tim Hall79d07d22020-04-27 18:20:16 +010032
33
def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
    # Assign strictly increasing addresses to the live ranges.  Tensors that are
    # duplicates of an already-placed tensor (same weight compression config, or
    # an equivalent LUT) are mapped onto the existing address instead of getting
    # fresh storage.
    next_address = 0
    placed = []  # tensors that already have an address

    for tens, lr in live_ranges.ranges.items():
        if tens in placed:
            continue

        # Default: allocate at the current end of memory, unless a duplicate
        # of this tensor has already been placed.
        address = next_address
        if tens.weight_compression_config is not None:
            address = next(
                (
                    prev.address
                    for prev in placed
                    if prev.weight_compression_config == tens.weight_compression_config
                ),
                address,
            )
        if tens.purpose == TensorPurpose.LUT:
            address = next((prev.address for prev in placed if prev.equivalent(tens)), address)

        lr.set_address(address)
        placed.extend(lr.tensors)
        # Only grow total memory when this range got a fresh (non-shared) address.
        if address == next_address:
            next_address += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    verify_alignment(live_ranges, alloc_granularity)
    return next_address
62
63
def verify_alignment(live_ranges, alignment):
    # Sanity check: any tensor touched by the CPU must sit on an address that is
    # a multiple of `alignment`; NPU-only tensors are exempt.
    for lr in live_ranges.ranges.values():
        for tens in lr.tensors:
            npu_only = all(op and op.run_on_npu for op in tens.ops + tens.consumer_list)
            if npu_only:
                continue
            # This is a CPU tensor, verify alignment
            if tens.address % alignment:
                raise AllocationError("Tensor {} not aligned to {} bytes".format(tens.name, alignment))
71
72
def mark_sram_used_for_cascaded_passes(sg, lrs):
    # Build a per-timestep SRAM usage histogram from the live ranges, then
    # record on each cascaded pass (and its member passes) the peak usage
    # over the two timesteps it spans.
    last_time = max(ps.time for ps in sg.cascaded_passes) + 2
    usage_per_step = np.zeros(last_time, dtype=np.int64)

    for tens, rng in lrs.ranges.items():
        usage_per_step[rng.start_time : rng.end_time] += tens.storage_size()

    for cps in sg.cascaded_passes:
        peak = max(usage_per_step[cps.time], usage_per_step[cps.time + 1])
        cps.sram_used = peak
        for ps in cps.passes:
            ps.sram_used = peak
86
87
def print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation):
    # Optionally dump a human-readable allocation table, plus the theoretical
    # minimum SRAM allocation derived from the cascaded passes.
    if verbose_allocation:
        constant_sets = (set((MemType.Permanent_NPU,)), set((MemType.Permanent_CPU,)))
        if mem_type_set in constant_sets:
            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")
        else:
            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")

        # One row per tensor, sorted by (start_time, start_address, ...)
        rows = sorted(
            (
                lr.start_time,
                tens.address,
                tens.address + int(math.ceil(tens.storage_size())),
                tens.name + " " + str(tens.purpose),
                lr.end_time,
            )
            for tens, lr in lrs.ranges.items()
        )
        mem_usage = 0
        for start_time, start, end, name, end_time in rows:
            name = name.replace("\x00", "")
            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
            mem_usage = max(mem_usage, end)
        print("Memory usage: {} ({:#x}) bytes / {:.1f} KB".format(mem_usage, mem_usage, mem_usage / 1024))
        print()

    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
        print(
            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
        )
117
118
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    mem_type_set,
    use_ifm_ofm_overlap=True,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    show_minimum_possible_allocation=False,
    lr_graph=None,
    allocation_alignment=Tensor.AllocationQuantum,
    max_size=None,
    dry_test=False,
):
    # Allocates addresses to tensors in the given memory area, and updates the
    # memory usage bookkeeping on the subgraph and network graph.
    # Returns False if the tensors could not be fit within max_size, True otherwise.
    # If dry_test is set (or allocation fails) all assigned addresses are undone.
    ignore_subgraph_input_output_tensors = False
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mem_type_set,
        mark_output_tensors_overlapping_with_input_tensors=False,
        use_ifm_ofm_overlap=use_ifm_ofm_overlap,
        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
        lr_graph=lr_graph,
        allocation_alignment=allocation_alignment,
    )

    if lrs.ranges:
        tens_alloc = tensor_allocator
        if tens_alloc == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, allocation_alignment, verbose_allocation)
        elif tens_alloc == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs, allocation_alignment)
        else:
            assert 0  # unknown allocator - internal invariant
        alloc_ok = max_size is None or total_sz <= max_size
        if dry_test or not alloc_ok:
            # Dry test or allocation failed; undo allocation
            for lr in lrs.ranges.values():
                lr.set_address(None)
            return alloc_ok

        # Accumulate memory usage per area (get() covers the first allocation)
        sg.memory_used[mem_area] = sg.memory_used.get(mem_area, 0) + total_sz

        # Keep track of how much should be used for scratch or permanent storage for NPU
        for mem_type in mem_type_set:
            sg.memory_used_per_type[mem_type] = sg.memory_used_per_type.get(mem_type, 0) + total_sz

        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)

        print_allocation(lrs, mem_area, mem_type_set, sg, verbose_allocation, show_minimum_possible_allocation)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
        # NOTE: use a distinct loop variable; the original code rebound the
        # mem_area parameter here, which would silently shadow it.
        for area in nng.total_elements.keys():
            try:
                nng.bits_per_element[area] = nng.total_size[area] * 8 / nng.total_elements[area]
            except ZeroDivisionError:
                nng.bits_per_element[area] = 0.0
    return True