# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Wrapping function to do tensor address allocation. That is, assigning addresses to tensors based on what has been
# worked out from the allowable overlaps that are calculated by the live range analysis.
import math

import numpy as np

from . import live_range
from . import numeric_util
from .tensor import MemArea
from .nn_graph import TensorAllocator
from .greedy_allocation import allocate_live_ranges as greedy_allocate_live_ranges
31
32
def linear_allocate_live_ranges(live_ranges, alloc_granularity=256):
    """Allocate live ranges by assigning strictly increasing addresses.

    Every live range is placed at the current top of memory, and the top is
    bumped by the range's size rounded up to alloc_granularity. No overlap
    information is exploited, so this produces the simplest (and largest)
    possible layout.

    Returns the total allocated size in bytes.
    """
    total_sz = 0
    # Several entries in live_ranges.ranges can share one live range, so track
    # what has already been placed. A set gives O(1) membership tests; the
    # original list made the loop accidentally quadratic.
    allocated_tensors = set()

    # Just assign increasing addresses in iteration order.
    for tens, lr in live_ranges.ranges.items():
        if tens in allocated_tensors:
            continue

        lr.set_address(total_sz)
        allocated_tensors.update(lr.tensors)
        total_sz += numeric_util.round_up(int(math.ceil(lr.size)), alloc_granularity)

    return total_sz
47
48
def mark_sram_used_for_cascaded_passes(sg, lrs):
    """Annotate every cascaded pass of sg (and each pass inside it) with the
    peak SRAM usage implied by the tensor live ranges in lrs."""
    # Build a per-timestep usage profile; +2 so cps.time + 1 is always in range.
    timestep_count = max(cps.time for cps in sg.cascaded_passes) + 2
    usage = np.zeros(timestep_count, dtype=np.int64)

    # Each tensor contributes its storage footprint over its whole live span.
    for tens, rng in lrs.ranges.items():
        usage[rng.start_time : rng.end_time] += tens.storage_size()

    # A cascaded pass spans two timesteps; charge it the worse of the two.
    for cps in sg.cascaded_passes:
        peak = max(usage[cps.time], usage[cps.time + 1])
        cps.sram_used = peak
        for ps in cps.passes:
            ps.sram_used = peak
62
63
def print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation):
    """Print the address layout of the allocated live ranges and, for Sram,
    the minimum allocation implied by the cascaded passes' usage."""
    if verbose_allocation:
        if mem_area == MemArea.Sram:
            print("allocation for", mem_area, "- non-constant tensors in Cpu and Npu subgraphs")
        else:
            print("allocation for", mem_area, "- constant tensors in", sg.placement.name, "subgraph(s)")

        # Collect one summary tuple per tensor, then report in sorted order
        # (primary key: live-range start time, then address).
        entries = []
        for tens, lr in lrs.ranges.items():
            first_byte = tens.address
            last_byte = tens.address + int(math.ceil(tens.storage_size()))
            label = tens.name + " " + str(tens.purpose)
            entries.append((lr.start_time, first_byte, last_byte, label, lr.end_time))

        for start_time, start, end, name, end_time in sorted(entries):
            name = name.replace("\x00", "")
            print("%9d: %#12x - %#12x: %3d - %3d %s" % ((end - start), start, end, start_time, end_time, name))
        print()

    if show_minimum_possible_allocation and mem_area == MemArea.Sram:
        min_possible_allocation = max(cps.sram_used for cps in sg.cascaded_passes)
        print(
            "Min possible allocation %d bytes / %.1f KB / %.1f MB"
            % (min_possible_allocation, min_possible_allocation / 1024, min_possible_allocation / 1024 / 1024)
        )
90
91
def allocate_tensors(
    nng,
    sg,
    arch,
    mem_area,
    use_ifm_ofm_overlap=True,
    tensor_allocator=TensorAllocator.Greedy,
    verbose_allocation=False,
    show_minimum_possible_allocation=False,
    lr_graph=None,
):
    """Assign addresses to the tensors of subgraph sg within mem_area.

    Extracts the tensor live ranges, runs the selected allocator over them,
    records memory usage on sg and nng, and - for Sram - marks the per-pass
    SRAM usage across all subgraphs. When sg is the root subgraph, also
    derives nng-wide bits-per-element statistics.

    Raises ValueError if tensor_allocator is not a supported allocator.
    """
    ignore_subgraph_input_output_tensors = False
    lrs = live_range.extract_live_ranges_from_cascaded_passes(
        sg,
        mem_area,
        mark_output_tensors_overlapping_with_input_tensors=False,
        use_ifm_ofm_overlap=use_ifm_ofm_overlap,
        ignore_subgraph_input_output_tensors=ignore_subgraph_input_output_tensors,
        lr_graph=lr_graph,
    )

    if lrs.ranges:
        if tensor_allocator == TensorAllocator.Greedy:
            total_sz = greedy_allocate_live_ranges(sg, arch, lrs, mem_area, verbose_allocation)
        elif tensor_allocator == TensorAllocator.LinearAlloc:
            total_sz = linear_allocate_live_ranges(lrs)
        else:
            # Explicit error instead of `assert 0`: asserts are stripped under
            # `python -O`, which would otherwise surface as a NameError below.
            raise ValueError("Unsupported tensor allocator: {}".format(tensor_allocator))

        sg.memory_used[mem_area] = total_sz

        nng.total_size[mem_area] = nng.total_size.get(mem_area, 0) + sum(tens.storage_size() for tens in lrs.ranges)
        nng.total_elements[mem_area] = nng.total_elements.get(mem_area, 0) + sum(tens.elements() for tens in lrs.ranges)

        print_allocation(lrs, mem_area, sg, verbose_allocation, show_minimum_possible_allocation)

        if mem_area == MemArea.Sram:
            # Mark Sram usage for all subgraphs
            for sg_ in nng.subgraphs:
                mark_sram_used_for_cascaded_passes(sg_, lrs)

    if sg == nng.get_root_subgraph():
        nng.memory_used = sg.memory_used
        # Distinct loop variable: the original reused `mem_area`, clobbering
        # the parameter for the remainder of the function.
        for area in nng.total_elements:
            try:
                nng.bits_per_element[area] = nng.total_size[area] * 8 / nng.total_elements[area]
            except ZeroDivisionError:
                nng.bits_per_element[area] = 0.0