blob: 0ef8688ea1ad04172293c4bf2e485b62ccbb1046 [file] [log] [blame]
Louis Verhaard9bfe0f82020-12-03 12:26:25 +01001/*
2 * Copyright (c) 2020 Arm Limited. All rights reserved.
3 *
4 * SPDX-License-Identifier: Apache-2.0
5 *
6 * Licensed under the Apache License, Version 2.0 (the License); you may
7 * not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
14 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 *
18 * Description:
19 * Declaration of the search-based allocator.
20 */
21
22#ifndef __SEARCH_ALLOCATOR_H
23#define __SEARCH_ALLOCATOR_H
24
25#include <algorithm>
26#include <cstdint>
27#include <random>
28#include <set>
29#include <vector>
30
31/**
32 * Live range
33 */
34struct LiveRange {
35 /** Start time (input to the allocator algorithm) */
36 uint32_t start_time;
37 /** End time, inclusive (input to the allocator algorithm) */
38 uint32_t end_time;
39 /** Size in bytes (input to the allocator algorithm) */
40 uint32_t size;
41 /** Index of this live range */
42 int id;
43 /** Allocated address (the main output from the allocator algorithm) */
44 uint32_t address;
45 /** End address, exclusive */
46 uint32_t end_address;
47 /** id of predecessor live range (predecessor's end address == this lr's address) */
48 int predecessor;
49 /** Turn at which the live range was allocated */
50 size_t turn;
51
52 bool overlaps(uint32_t addr2, uint32_t size2) const {
53 return address < addr2 + size2 && addr2 < end_address;
54 }
55 bool is_neighbour(const LiveRange &lr) const {
56 return start_time <= lr.end_time && lr.start_time <= end_time;
57 }
58};
59
60/**
61 * Implements tensor allocator using state space exploration.
62 *
63 * The basic algorithm is:
64 *
65 * Use a heuristic allocator to find an initial allocation
66 * while allocation is not optimal and iterations < MAX_ITERATIONS {
67 * find the "bottleneck": the live range with highest end address
68 * find all live ranges that affected the allocation of the bottleneck
69 * swap the order of any two affecting live ranges
70 * reallocate tensors using the reordered live ranges
71 * if the new allocation is better: keep it, else set allocation to previous allocation
72 * }
73 */
74class SearchAllocator {
75private:
76 static constexpr int MAX_ITERATIONS = 500;
77 static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX;
78 /** Used for live ranges allocated at address 0 */
79 static constexpr int NO_PREDECESSOR = -1;
80 /** Contains the live ranges */
81 std::vector<LiveRange> lrs;
82 /** Contains active live ranges at each timestamp */
83 std::vector<std::vector<LiveRange*>> lrs_at_time;
84 /**
85 * Contains neighbours of each live range (indexed by lr.id), i.e.
86 * live ranges with overlapping start/end time.
87 */
88 std::vector<std::vector<LiveRange*>> neighbours;
89 /**
90 * At each timestamp: accumulated size of active live ranges
91 */
92 std::vector<uint32_t> size_at_time;
93 /**
94 * For each live range: max value of size_at_time (only used in the heuristic allocation)
95 */
96 std::vector<uint32_t> lr_urgency;
97 /**
98 * The minimum possible size, assuming all live ranges can be perfectly allocated
99 */
100 uint32_t min_required_size;
101 /**
102 * The available size (input to algorithm).
103 */
104 uint32_t available_size;
105 /** The algorithm stops once the target size has been achieved */
106 uint32_t target_size;
107 /** The highest end address of the best found allocation */
108 uint32_t best_size;
109 /** Number of performed iterations */
110 size_t nr_iterations = 0;
111 /** Random number generator; use default seed (which is well-defined) */
112 std::mt19937 rng;
113public:
114 SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit);
115 /**
116 * Runs the allocation algorithm. Finishes when the target size has been
117 * reached or when maximum iterations have been run.
118 * The allocated addresses are placed in the output vector, in the same
119 * order as the input vector.
120 *
121 * Implementation note: the algorithm produces reproduceable results by using
122 * a well-defined random number generator with well-defined default seed,
123 * and using a fixed number of iterations.
124 */
125 uint32_t allocate(std::vector<uint32_t> &output);
126 uint32_t get_min_required_size() const {
127 return min_required_size;
128 }
129 size_t get_nr_iterations() const {
130 return nr_iterations;
131 }
132private:
133 /**
134 * Allocates the given live range at the smallest possible address
135 */
136 void allocate_lr(LiveRange &lr) const;
137 /**
138 * Allocates the live ranges in the order indicated by the indices;
139 * allocates each live range at the lowest possible address.
140 */
141 uint32_t allocate_indices(const std::vector<size_t> &indices);
142 /** Sorts live ranges based on heuristics, used for the initial allocation */
143 void sort_indices_on_prio(std::vector<size_t> &indices) const;
144 /** Adds the given live range + predecessors to the turns vector */
145 void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const;
146 /**
147 * Finds the "bottleneck", the live range with highest end address, and reorders the indices
148 * such that a next allocation might lower the memory usage.
149 *
150 * ---------
151 * | |
152 * | D |
153 * | |
154 * ----------------------------------
155 * | B |
156 * -------------------------------
157 * | |
158 * |A| ---
159 * | | |C|
160 * | | | |
161 * ---------------------------------------
162 *
163 * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
164 * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
165 * and all direct and indirect predecessors of D and its neighbours
166 * (i.e. A, which is the predecessor of B, and indirect predecessor of D).
167 *
168 * By permuting the order in which the affecting live ranges are allocated, the bottleneck might
169 * be lowered. In the above example, almost any permutation would lower the bottleneck.
170 *
171 * Note that there is room to improve the efficiency of the algorithm.
172 * One way could be to first allocate all direct neighbours of the bottleneck
173 * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does
174 * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck);
175 * it just uses a higher probability of selecting A.
176 */
177 void attempt_bottleneck_fix(std::vector<size_t> &indices);
178 /** Search for a solution, using the given indices as initial solution. */
179 void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations);
180};
181
182/** Wrapper function to perform live range allocation */
183uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output);
184
185#endif // __SEARCH_ALLOCATOR_H