MLBEDSW-1373: Added search based allocator

Added a new tensor allocator that is based on searching,
implemented in C++ (C++11 compatible).

Change-Id: Ie96e9fcfc8e6c58d1fa53911f37de290eeba88cf
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp
new file mode 100644
index 0000000..ce5c46d
--- /dev/null
+++ b/ethosu/tensor_allocator/search_allocator.cpp
@@ -0,0 +1,266 @@
+/*
+ * Copyright (c) 2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Description:
+ * Implementation of the search-based allocator.
+ */
+
+#include <algorithm>
+#include <cstdint>
+#include <set>
+#include <vector>
+
+#include "search_allocator.h"
+
+SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
+    lrs = live_ranges;
+    uint32_t max_end_time = 0;
+    for (size_t i = 0; i < lrs.size(); ++i) {
+        auto &lr = lrs[i];
+        lr.id = i;
+        max_end_time = std::max(max_end_time, lr.end_time);
+    }
+    lrs_at_time.resize(max_end_time + 1);
+    size_at_time.resize(max_end_time + 1);
+    neighbours.resize(lrs.size());
+    // Calculate which live ranges are active at every timestamp
+    for (size_t t = 0; t <= max_end_time; ++t) {
+        lrs_at_time[t].clear();
+    }
+    for (auto &lr : lrs) {
+        for (auto t = lr.start_time; t <= lr.end_time; ++t) {
+            lrs_at_time[t].push_back(&lr);
+        }
+    }
+    min_required_size = 0;
+    for (size_t t = 0; t <= max_end_time; ++t) {
+        // Calculate minimum needed size at each timestamp
+        uint32_t size_at_t = 0;
+        for (auto &lr : lrs_at_time[t]) {
+            size_at_t += lr->size;
+        }
+        size_at_time[t] = size_at_t;
+        min_required_size = std::max(size_at_t, min_required_size);
+        // Calculate all neighbours
+        for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
+            auto lr1 = lrs_at_time[t][i];
+            auto &nb1 = neighbours[lr1->id];
+            for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
+                auto lr2 = lrs_at_time[t][j];
+                if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
+                    nb1.push_back(lr2);
+                    neighbours[lr2->id].push_back(lr1);
+                }
+            }
+        }
+    }
+    target_size = std::max(min_required_size, available_size);
+    // Calculate the urgency of each live range
+    lr_urgency.resize(lrs.size());
+    for (size_t i = 0; i < lrs.size(); ++i) {
+        auto &lr = lrs[i];
+        uint32_t urgency = 0;
+        for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
+            urgency = std::max(size_at_time[t], urgency);
+        }
+        lr_urgency[i] = urgency;
+    }
+    best_size = UINT32_MAX;
+}
+
+uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
+    output.clear();
+    nr_iterations = 0;
+    std::vector<size_t> indices;
+    // Initial solution, using a heuristic allocator
+    for (size_t i = 0; i < lrs.size(); ++i) {
+        indices.push_back(i);
+    }
+    sort_indices_on_prio(indices);
+    // Allocate the initial solution
+    best_size = UINT32_MAX;
+    best_size = allocate_indices(indices);
+    if (best_size <= target_size) {
+        // The heuristic allocation returned an optimal solution.
+        // No need to search.
+    } else {
+        // Try to improve the heuristic allocation
+        search(indices, best_size, MAX_ITERATIONS);
+    }
+    output.clear();
+    for (auto &lr : lrs) {
+        output.push_back(lr.address);
+    }
+    return best_size;
+}
+
+void SearchAllocator::allocate_lr(LiveRange &lr) const {
+    uint32_t address = 0;
+    int predecessor = NO_PREDECESSOR;
+    bool fits = false;
+    while (!fits) {
+        fits = true;
+        // Find neighbours that overlap with address
+        for (auto lr2_p : neighbours[lr.id]) {
+            if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) {
+                continue;
+            }
+            if (lr2_p->overlaps(address, lr.size)) {
+                // Overlap found; increase address
+                fits = false;
+                address = lr2_p->end_address;
+                predecessor = lr2_p->id;
+            }
+        }
+    }
+    lr.address = address;
+    lr.end_address = address + lr.size;
+    lr.predecessor = predecessor;
+}
+
+uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
+    ++nr_iterations;
+    std::vector<size_t> count(indices.size());
+    for (auto &lr : lrs) {
+        lr.address = NOT_ALLOCATED;
+    }
+    uint32_t size = 0;
+    for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
+        auto &lr = lrs[indices[turn]];
+        allocate_lr(lr);
+        lr.turn = turn;
+        size = std::max(size, lr.end_address);
+    }
+    return size;
+}
+
+void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
+    std::sort(indices.begin(), indices.end(),
+        [this] (size_t const& a, size_t const& b) {
+            if (lr_urgency[a] != lr_urgency[b]) {
+                return lr_urgency[a] > lr_urgency[b];
+            }
+            auto &lr1 = lrs[a];
+            auto &lr2 = lrs[b];
+            auto duration1 = lr1.end_time - lr1.start_time;
+            auto duration2 = lr2.end_time - lr2.start_time;
+            if (duration1 != duration2) {
+                return duration1 > duration2;
+            }
+            if (lr1.start_time != lr2.start_time) {
+                return lr1.start_time < lr2.start_time;
+            }
+            if (lr1.size != lr2.size) {
+                return lr1.size > lr2.size;
+            }
+            return lr1.id < lr2.id;
+        });
+}
+
+void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
+    turns.insert(lr.turn);
+    int id = lr.id;
+    while (lrs[id].predecessor != NO_PREDECESSOR) {
+        id = lrs[id].predecessor;
+        turns.insert(lrs[id].turn);
+    }
+}
+
+void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
+    // Find the bottleneck
+    LiveRange *max_lr = &lrs[0];
+    for (auto &lr : lrs) {
+        if (lr.end_address > max_lr->end_address) {
+            max_lr = &lr;
+        }
+    }
+    // Find all live ranges that affected the placement of the bottleneck live range.
+    // This consists of two types of live ranges:
+    // - direct neighbours of the bottleneck live range
+    // - direct and indirect predecessors of these neighbours + bottleneck
+    // The turns at which these live ranges were allocated are put in the turns vector.
+    std::set<size_t> turns;
+    add_predecessor_turns(turns, *max_lr);
+    for (auto lr_p : neighbours[max_lr->id]) {
+        add_predecessor_turns(turns, *lr_p);
+    }
+    // Non-direct neighbours that interfere with the allocation of the bottleneck are the
+    // immediate cause for gaps in the allocation, and are selected with higher probability.
+    std::vector<size_t> turn_list;
+    std::vector<size_t> non_nb_turn_list;
+    for (auto turn : turns) {
+        turn_list.push_back(turn);
+        auto &lr = lrs[indices[turn]];
+        if (!max_lr->is_neighbour(lr)) {
+            non_nb_turn_list.push_back(turn);
+        }
+    }
+    size_t ix1;
+    if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
+        // Pick a live range from the "non-neighbour list"
+        ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
+    } else {
+        // Pick any affecting live range.
+        ix1 = turn_list[rng() % turn_list.size()];
+    }
+    size_t ix2 = turn_list[rng() % turn_list.size() - 1];
+    if (ix1 == ix2) {
+        ix2 = turn_list[turn_list.size() - 1];
+    }
+    // Swap indices
+    std::swap(indices[ix1], indices[ix2]);
+}
+
+void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
+    std::vector<size_t> best_indices = indices;
+    std::vector<LiveRange> best_lrs = lrs;
+    for (int i = 0; i < iterations; ++i) {
+        // Reorder the indices
+        attempt_bottleneck_fix(indices);
+        // Allocate the reordered indices and check if it gave an improvement
+        auto new_size = allocate_indices(indices);
+        if (new_size <= best_size) {
+            // The new allocation produced a new best result; remember it
+            best_size = new_size;
+            best_indices = indices;
+            best_lrs = lrs;
+            if (best_size <= target_size) {
+                // Target reached; stop
+                return;
+            }
+        } else {
+            // The new allocation produced worse result; undo the change
+            indices = best_indices;
+            lrs = best_lrs;
+        }
+    }
+    lrs = best_lrs;
+}
+
+uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
+    // Convert input to vector of live ranges
+    std::vector<LiveRange> lrs;
+    for (size_t i = 0; i < input.size(); i += 3) {
+        LiveRange lr;
+        lr.start_time = input[i];
+        lr.end_time = input[i+1];
+        lr.size = input[i+2];
+        lrs.push_back(lr);
+    }
+    SearchAllocator allocator(lrs, available_size);
+    return allocator.allocate(output);
+}