MLBEDSW-3808: Ported search allocator to Python

- Straight port of the C++ implementation to Python.
- Renamed the allocator from "Search" to "HillClimb"

Change-Id: I50797d541f326d0264daf79bf7866aef32350a60
Signed-off-by: Louis Verhaard <louis.verhaard@arm.com>
diff --git a/OPTIONS.md b/OPTIONS.md
index e191d4e..1293f17 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -183,8 +183,8 @@
 Specify which allocator algorithm to use for non-constant NPU and CPU tensor
 allocation.  
 **Type: String**  
-**Default: Search**  
-**Choices: [Greedy, LinearAlloc, Search]**  
+**Default: HillClimb**  
+**Choices: [Greedy, LinearAlloc, HillClimb]**  
 
 ```bash
 vela network.tflite --tensor-allocator=LinearAlloc
diff --git a/README.md b/README.md
index 81bb0a0..1d6d073 100644
--- a/README.md
+++ b/README.md
@@ -54,9 +54,9 @@
 [ML Platform](https://review.mlplatform.org/plugins/gitiles/ml/ethos-u/ethos-u-vela).
 Both methods will automatically install all the required dependencies.
 
-**Note:** For installing on Microsoft Windows you need to have a C99 and C++11
-capable toolchain installed. The recommended and tested toolchain is Microsoft
-Visual C++ 14.x Build Tools, see <https://wiki.python.org/moin/WindowsCompilers>
+**Note:** For installing on Microsoft Windows you need to have a C99 capable
+toolchain installed. The recommended and tested toolchain is Microsoft Visual
+C++ 14.x Build Tools, see <https://wiki.python.org/moin/WindowsCompilers>
 
 ### PyPi
 
diff --git a/ethosu/tensor_allocator/makefile b/ethosu/tensor_allocator/makefile
deleted file mode 100644
index 9636eb9..0000000
--- a/ethosu/tensor_allocator/makefile
+++ /dev/null
@@ -1,50 +0,0 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
-#
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the License); you may
-# not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an AS IS BASIS, WITHOUT
-# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Description:
-# Makefile to build tensor_allocator_main
-
-UNAME=$(shell uname -o)
-
-CXXFLAGS=--std=c++11 -pedantic-errors -Wall -Werror -Wdate-time
-CXXFLAGS+=-fwrapv -fstack-protector-strong -flto -fuse-linker-plugin -ffat-lto-objects -fPIC
-
-ifeq ($(DEBUG),1)
-    CXXFLAGS+=-g -O0 -DDEBUG
-else
-    CXXFLAGS+=-O2
-endif
-
-LIBSRCS=tensor_allocator_main.cpp search_allocator.cpp
-LIBHDRS=search_allocator.h
-
-ifeq ($(UNAME),Cygwin)
-    TENSOR_ALLOCATOR_EXE=tensor_allocator_main.exe
-else
-    TENSOR_ALLOCATOR_EXE=tensor_allocator_main
-endif
-
-all: tensor_allocator_exe
-
-.PHONY: tensor_allocator_exe
-tensor_allocator_exe: $(TENSOR_ALLOCATOR_EXE)
-
-clean:
-	rm -f $(TENSOR_ALLOCATOR_EXE)
-
-$(TENSOR_ALLOCATOR_EXE): $(LIBSRCS) $(LIBHDRS) makefile
-	g++ $(CXXFLAGS) $(LIBSRCS) -o $(TENSOR_ALLOCATOR_EXE)
diff --git a/ethosu/tensor_allocator/search_allocator.cpp b/ethosu/tensor_allocator/search_allocator.cpp
deleted file mode 100644
index 5c7492b..0000000
--- a/ethosu/tensor_allocator/search_allocator.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Description:
- * Implementation of the search-based allocator.
- */
-
-#include <algorithm>
-#include <cstdint>
-#include <set>
-#include <vector>
-
-#include "search_allocator.h"
-
-SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
-    lrs = live_ranges;
-    uint32_t max_end_time = 0;
-    for (size_t i = 0; i < lrs.size(); ++i) {
-        auto &lr = lrs[i];
-        lr.id = static_cast<int>(i);
-        max_end_time = std::max(max_end_time, lr.end_time);
-    }
-    lrs_at_time.resize(max_end_time + 1);
-    size_at_time.resize(max_end_time + 1);
-    neighbours.resize(lrs.size());
-    // Calculate which live ranges are active at every timestamp
-    for (size_t t = 0; t <= max_end_time; ++t) {
-        lrs_at_time[t].clear();
-    }
-    for (auto &lr : lrs) {
-        for (auto t = lr.start_time; t <= lr.end_time; ++t) {
-            lrs_at_time[t].push_back(&lr);
-        }
-    }
-    min_required_size = 0;
-    for (size_t t = 0; t <= max_end_time; ++t) {
-        // Calculate minimum needed size at each timestamp
-        uint32_t size_at_t = 0;
-        for (auto &lr : lrs_at_time[t]) {
-            size_at_t += lr->size;
-        }
-        size_at_time[t] = size_at_t;
-        min_required_size = std::max(size_at_t, min_required_size);
-        // Calculate all neighbours
-        for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
-            auto lr1 = lrs_at_time[t][i];
-            auto &nb1 = neighbours[lr1->id];
-            for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
-                auto lr2 = lrs_at_time[t][j];
-                if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
-                    nb1.push_back(lr2);
-                    neighbours[lr2->id].push_back(lr1);
-                }
-            }
-        }
-    }
-    target_size = std::max(min_required_size, size_limit);
-    // Calculate the urgency of each live range
-    lr_urgency.resize(lrs.size());
-    for (size_t i = 0; i < lrs.size(); ++i) {
-        auto &lr = lrs[i];
-        uint32_t urgency = 0;
-        for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
-            urgency = std::max(size_at_time[t], urgency);
-        }
-        lr_urgency[i] = urgency;
-    }
-    best_size = UINT32_MAX;
-}
-
-uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
-    output.clear();
-    nr_iterations = 0;
-    std::vector<size_t> indices;
-    // Initial solution, using a heuristic allocator
-    for (size_t i = 0; i < lrs.size(); ++i) {
-        indices.push_back(i);
-    }
-    sort_indices_on_prio(indices);
-    // Allocate the initial solution
-    best_size = UINT32_MAX;
-    best_size = allocate_indices(indices);
-    if (best_size <= target_size) {
-        // The heuristic allocation returned an optimal solution.
-        // No need to search.
-    } else {
-        // Try to improve the heuristic allocation
-        search(indices, best_size, MAX_ITERATIONS);
-    }
-    output.clear();
-    for (auto &lr : lrs) {
-        output.push_back(lr.address);
-    }
-    return best_size;
-}
-
-void SearchAllocator::allocate_lr(LiveRange &lr) const {
-    uint32_t address = 0;
-    int predecessor = NO_PREDECESSOR;
-    bool fits = false;
-    while (!fits) {
-        fits = true;
-        // Find neighbours that overlap with address
-        for (auto lr2_p : neighbours[lr.id]) {
-            if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) {
-                continue;
-            }
-            if (lr2_p->overlaps(address, lr.size)) {
-                // Overlap found; increase address
-                fits = false;
-                address = lr2_p->end_address;
-                predecessor = lr2_p->id;
-            }
-        }
-    }
-    lr.address = address;
-    lr.end_address = address + lr.size;
-    lr.predecessor = predecessor;
-}
-
-uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
-    ++nr_iterations;
-    std::vector<size_t> count(indices.size());
-    for (auto &lr : lrs) {
-        lr.address = NOT_ALLOCATED;
-    }
-    uint32_t size = 0;
-    for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
-        auto &lr = lrs[indices[turn]];
-        allocate_lr(lr);
-        lr.turn = turn;
-        size = std::max(size, lr.end_address);
-    }
-    return size;
-}
-
-void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
-    std::sort(indices.begin(), indices.end(),
-        [this] (size_t const& a, size_t const& b) {
-            if (lr_urgency[a] != lr_urgency[b]) {
-                return lr_urgency[a] > lr_urgency[b];
-            }
-            auto &lr1 = lrs[a];
-            auto &lr2 = lrs[b];
-            auto duration1 = lr1.end_time - lr1.start_time;
-            auto duration2 = lr2.end_time - lr2.start_time;
-            if (duration1 != duration2) {
-                return duration1 > duration2;
-            }
-            if (lr1.start_time != lr2.start_time) {
-                return lr1.start_time < lr2.start_time;
-            }
-            if (lr1.size != lr2.size) {
-                return lr1.size > lr2.size;
-            }
-            return lr1.id < lr2.id;
-        });
-}
-
-void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
-    turns.insert(lr.turn);
-    int id = lr.id;
-    while (lrs[id].predecessor != NO_PREDECESSOR) {
-        id = lrs[id].predecessor;
-        turns.insert(lrs[id].turn);
-    }
-}
-
-void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
-    // Find the bottleneck
-    LiveRange *max_lr = &lrs[0];
-    for (auto &lr : lrs) {
-        if (lr.end_address > max_lr->end_address) {
-            max_lr = &lr;
-        }
-    }
-    // Find all live ranges that affected the placement of the bottleneck live range.
-    // This consists of two types of live ranges:
-    // - direct neighbours of the bottleneck live range
-    // - direct and indirect predecessors of these neighbours + bottleneck
-    // The turns at which these live ranges were allocated are put in the turns vector.
-    std::set<size_t> turns;
-    add_predecessor_turns(turns, *max_lr);
-    for (auto lr_p : neighbours[max_lr->id]) {
-        add_predecessor_turns(turns, *lr_p);
-    }
-    // Non-direct neighbours that interfere with the allocation of the bottleneck are the
-    // immediate cause for gaps in the allocation, and are selected with higher probability.
-    std::vector<size_t> turn_list;
-    std::vector<size_t> non_nb_turn_list;
-    for (auto turn : turns) {
-        turn_list.push_back(turn);
-        auto &lr = lrs[indices[turn]];
-        if (!max_lr->is_neighbour(lr)) {
-            non_nb_turn_list.push_back(turn);
-        }
-    }
-    size_t ix1;
-    if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
-        // Pick a live range from the "non-neighbour list"
-        ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
-    } else {
-        // Pick any affecting live range.
-        ix1 = turn_list[rng() % turn_list.size()];
-    }
-    // Note: turn_list has always at least 2 elements for bottlenecks
-    size_t ix2 = turn_list[rng() % (turn_list.size() - 1)];
-    if (ix1 == ix2) {
-        ix2 = turn_list[turn_list.size() - 1];
-    }
-    // Swap indices
-    std::swap(indices[ix1], indices[ix2]);
-}
-
-void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
-    std::vector<size_t> best_indices = indices;
-    std::vector<LiveRange> best_lrs = lrs;
-    for (int i = 0; i < iterations; ++i) {
-        // Reorder the indices
-        attempt_bottleneck_fix(indices);
-        // Allocate the reordered indices and check if it gave an improvement
-        auto new_size = allocate_indices(indices);
-        if (new_size <= best_size) {
-            // The new allocation produced a new best result; remember it
-            best_size = new_size;
-            best_indices = indices;
-            best_lrs = lrs;
-            if (best_size <= target_size) {
-                // Target reached; stop
-                return;
-            }
-        } else {
-            // The new allocation produced worse result; undo the change
-            indices = best_indices;
-            lrs = best_lrs;
-        }
-    }
-    lrs = best_lrs;
-}
-
-uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
-    // Convert input to vector of live ranges
-    std::vector<LiveRange> lrs;
-    for (size_t i = 0; i < input.size(); i += 3) {
-        LiveRange lr;
-        lr.start_time = input[i];
-        lr.end_time = input[i+1];
-        lr.size = input[i+2];
-        lrs.push_back(lr);
-    }
-    SearchAllocator allocator(lrs, available_size);
-    return allocator.allocate(output);
-}
diff --git a/ethosu/tensor_allocator/search_allocator.h b/ethosu/tensor_allocator/search_allocator.h
deleted file mode 100644
index 6c75015..0000000
--- a/ethosu/tensor_allocator/search_allocator.h
+++ /dev/null
@@ -1,181 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- *
- * Description:
- * Declaration of the search-based allocator.
- */
-
-#ifndef __SEARCH_ALLOCATOR_H
-#define __SEARCH_ALLOCATOR_H
-
-#include <algorithm>
-#include <cstdint>
-#include <random>
-#include <set>
-#include <vector>
-
-/**
- * Live range
- */
-struct LiveRange {
-    /** Start time (input to the allocator algorithm) */
-    uint32_t start_time;
-    /** End time, inclusive (input to the allocator algorithm) */
-    uint32_t end_time;
-    /** Size in bytes (input to the allocator algorithm) */
-    uint32_t size;
-    /** Index of this live range */
-    int id;
-    /** Allocated address (the main output from the allocator algorithm) */
-    uint32_t address;
-    /** End address, exclusive */
-    uint32_t end_address;
-    /** id of predecessor live range (predecessor's end address == this lr's address) */
-    int predecessor;
-    /** Turn at which the live range was allocated */
-    size_t turn;
-
-    bool overlaps(uint32_t addr2, uint32_t size2) const {
-        return address < addr2 + size2 && addr2 < end_address;
-    }
-    bool is_neighbour(const LiveRange &lr) const {
-        return start_time <= lr.end_time && lr.start_time <= end_time;
-    }
-};
-
-/**
- * Implements tensor allocator using state space exploration.
- *
- * The basic algorithm is:
- *
- * Use a heuristic allocator to find an initial allocation
- * while allocation is not optimal and iterations < MAX_ITERATIONS {
- *     find the "bottleneck": the live range with highest end address
- *     find all live ranges that affected the allocation of the bottleneck
- *     swap the order of any two affecting live ranges
- *     reallocate tensors using the reordered live ranges
- *     if the new allocation is better: keep it, else set allocation to previous allocation
- * }
- */
-class SearchAllocator {
-private:
-    static constexpr int MAX_ITERATIONS = 500;
-    static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX;
-    /** Used for live ranges allocated at address 0 */
-    static constexpr int NO_PREDECESSOR = -1;
-    /** Contains the live ranges */
-    std::vector<LiveRange> lrs;
-    /** Contains active live ranges at each timestamp */
-    std::vector<std::vector<LiveRange*>> lrs_at_time;
-    /**
-     * Contains neighbours of each live range (indexed by lr.id), i.e.
-     * live ranges with overlapping start/end time.
-     */
-    std::vector<std::vector<LiveRange*>> neighbours;
-    /**
-     * At each timestamp: accumulated size of active live ranges
-     */
-    std::vector<uint32_t> size_at_time;
-    /**
-     * For each live range: max value of size_at_time (only used in the heuristic allocation)
-     */
-    std::vector<uint32_t> lr_urgency;
-    /**
-     * The minimum possible size, assuming all live ranges can be perfectly allocated
-     */
-    uint32_t min_required_size;
-    /** The algorithm stops once the target size has been achieved */
-    uint32_t target_size;
-    /** The highest end address of the best found allocation */
-    uint32_t best_size;
-    /** Number of performed iterations */
-    size_t nr_iterations = 0;
-    /** Random number generator; use default seed (which is well-defined) */
-    std::mt19937 rng;
-public:
-    SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit);
-    /**
-     * Runs the allocation algorithm. Finishes when the target size has been
-     * reached or when maximum iterations have been run.
-     * The allocated addresses are placed in the output vector, in the same
-     * order as the input vector.
-     *
-     * Implementation note: the algorithm produces reproduceable results by using
-     * a well-defined random number generator with well-defined default seed,
-     * and using a fixed number of iterations.
-     */
-    uint32_t allocate(std::vector<uint32_t> &output);
-    uint32_t get_min_required_size() const {
-        return min_required_size;
-    }
-    size_t get_nr_iterations() const {
-        return nr_iterations;
-    }
-private:
-    /**
-     * Allocates the given live range at the smallest possible address
-     */
-    void allocate_lr(LiveRange &lr) const;
-    /**
-     * Allocates the live ranges in the order indicated by the indices;
-     * allocates each live range at the lowest possible address.
-     */
-    uint32_t allocate_indices(const std::vector<size_t> &indices);
-    /** Sorts live ranges based on heuristics, used for the initial allocation */
-    void sort_indices_on_prio(std::vector<size_t> &indices) const;
-    /** Adds the given live range + predecessors to the turns vector */
-    void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const;
-    /**
-     * Finds the "bottleneck", the live range with highest end address, and reorders the indices
-     * such that a next allocation might lower the memory usage.
-     *
-     *                          ---------
-     *                          |       |
-     *                          |   D   |
-     *                          |       |
-     * ----------------------------------
-     * |           B                 |
-     * -------------------------------
-     * | |
-     * |A|                      ---
-     * | |                      |C|
-     * | |                      | |
-     * ---------------------------------------
-     *
-     * In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
-     * The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
-     * and all direct and indirect predecessors of D and its neighbours
-     * (i.e. A, which is the predecessor of B, and indirect predecessor of D).
-     *
-     * By permuting the order in which the affecting live ranges are allocated, the bottleneck might
-     * be lowered. In the above example, almost any permutation would lower the bottleneck.
-     *
-     * Note that there is room to improve the efficiency of the algorithm.
-     * One way could be to first allocate all direct neighbours of the bottleneck
-     * (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does
-     * not actively try this, as it may lead to allocation loops (A could become the new bottle-neck);
-     * it just uses a higher probability of selecting A.
-     */
-    void attempt_bottleneck_fix(std::vector<size_t> &indices);
-    /** Search for a solution, using the given indices as initial solution. */
-    void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations);
-};
-
-/** Wrapper function to perform live range allocation */
-uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output);
-
-#endif // __SEARCH_ALLOCATOR_H
diff --git a/ethosu/tensor_allocator/tensor_allocator_main.cpp b/ethosu/tensor_allocator/tensor_allocator_main.cpp
deleted file mode 100644
index 27d96ef..0000000
--- a/ethosu/tensor_allocator/tensor_allocator_main.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#include <cstdint>
-#include <iostream>
-#include <vector>
-
-#include "search_allocator.h"
-
-using namespace std;
-
-/**
- * Reads live ranges from the input, and then performs allocation.
- * The input has format:
-
-<number of live ranges>
-<start_time> <end_time> <size>
-...
-
- * e.g.:
-4
-0 20 4096
-2 8 16000
-4 10 800
-6 20 1024
- */
-int main() {
-    int lr_count;
-    cin >> lr_count;
-    cin.ignore();
-    vector<uint32_t> input;
-    vector<uint32_t> output;
-    for (int i = 0; i < lr_count; ++i) {
-        LiveRange lr;
-        cin >> lr.start_time >> lr.end_time >> lr.size;
-        lr.id = i;
-        cin.ignore();
-        input.push_back(lr.start_time);
-        input.push_back(lr.end_time);
-        input.push_back(lr.size);
-    }
-    vector<LiveRange> lrs;
-    for (size_t i = 0; i < input.size(); i += 3) {
-        LiveRange lr;
-        lr.start_time = input[i];
-        lr.end_time = input[i+1];
-        lr.size = input[i+2];
-        lrs.push_back(lr);
-    }
-    SearchAllocator allocator(lrs, 0);
-    uint32_t result = allocator.allocate(output);
-    printf("Output:\n");
-    for (int i = 0; i < lr_count; ++i) {
-        printf("%4d: %6d %4d-%4d size %6d\n", i, output[i], input[3*i], input[3*i+1], input[3*i+2]);
-    }
-    uint32_t min_size = allocator.get_min_required_size();
-    double overhead = 100.0 * (result - min_size)/(double)min_size;
-    printf("Total size: %d (%1.1f K), minimum required size: %d, overhead: %1.2f%%\n",
-        result, result/1024.0, min_size, overhead);
-    printf("Search used %ld iterations\n", (long)allocator.get_nr_iterations());
-    return 0;
-}
diff --git a/ethosu/tensor_allocator/tensor_allocatormodule.cpp b/ethosu/tensor_allocator/tensor_allocatormodule.cpp
deleted file mode 100644
index 52f1c69..0000000
--- a/ethosu/tensor_allocator/tensor_allocatormodule.cpp
+++ /dev/null
@@ -1,105 +0,0 @@
-/*
- * Copyright (c) 2020 Arm Limited. All rights reserved.
- *
- * SPDX-License-Identifier: Apache-2.0
- *
- * Licensed under the Apache License, Version 2.0 (the License); you may
- * not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an AS IS BASIS, WITHOUT
- * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-#define PY_SSIZE_T_CLEAN
-#include <Python.h>
-#include <cstdint>
-
-#include <vector>
-#include "search_allocator.h"
-
-
-
-/**
- * C++ extension wrapper for allocate
- *
- * This method is exposed directly in python with the arguments with a
- * prototype of the form:
- *
- * output = tensor_allocator.allocate(input, available_size=0)
- *
- * input: [int]
- * available_size: int
- * output: [int]
- */
-static PyObject *method_allocate (PyObject *self, PyObject *args)
-{
-    /* Object to hold the input integer list. */
-    PyObject *input_list_object;
-
-    /* Object to hold the available size */
-    int available_size = 0;
-
-    /* Arguments to the method are delivered as a tuple, unpack the
-     * tuple to get the individual arguments, note the second is
-     * optional.
-     */
-    if (!PyArg_ParseTuple(args, "O|i", &input_list_object, &available_size)) {
-        return NULL;
-    }
-
-    /* Unpack the length of the input integer list. */
-    auto input_length = PyObject_Length(input_list_object);
-    if (input_length < 0) {
-        return NULL;
-    }
-    if (input_length % 3 != 0) {
-        PyErr_SetString(PyExc_ValueError, "Input length must be multiple of 3");
-        return NULL;
-    }
-    std::vector<uint32_t> input;
-    std::vector<uint32_t> output;
-    for (int i = 0; i < input_length; ++i) {
-        PyObject *obj = PyList_GetItem(input_list_object, i);
-        if (!PyLong_Check(obj)) {
-            PyErr_SetString(PyExc_ValueError, "Illegal value in input");
-            return NULL;
-        }
-        auto value = PyLong_AsLong(obj);
-        if (value < 0 || value > UINT32_MAX) {
-            PyErr_SetString(PyExc_ValueError, "Input value out of bounds");
-            return NULL;
-        }
-        input.push_back(value);
-    }
-    allocate(input, available_size, output);
-    PyObject *output_list = PyList_New(output.size());
-    for (size_t i = 0; i < output.size(); ++i) {
-        PyList_SetItem(output_list, i, PyLong_FromLong(output[i]));
-    }
-    return output_list;
-}
-
-/** tensor_allocator method descriptors. */
-static PyMethodDef tensor_allocator_methods[] = {
-    {"allocate", method_allocate, METH_VARARGS, "Python interface for allocate"},
-    {NULL, NULL, 0, NULL}
-};
-
-/** tensor_allocator module descriptor. */
-static struct PyModuleDef tensor_allocatormodule = {
-    PyModuleDef_HEAD_INIT,
-    "tensor_allocator",
-    "Python interface for tensor_allocator",
-    -1,
-    tensor_allocator_methods
-};
-
-PyMODINIT_FUNC PyInit_tensor_allocator(void) {
-    return PyModule_Create(&tensor_allocatormodule);
-}
diff --git a/ethosu/vela/hillclimb_allocation.py b/ethosu/vela/hillclimb_allocation.py
new file mode 100644
index 0000000..de53ab8
--- /dev/null
+++ b/ethosu/vela/hillclimb_allocation.py
@@ -0,0 +1,310 @@
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Tensor allocator based on a hill-climb search
+import random
+from typing import List
+from typing import Set
+
+from .live_range import LiveRange
+
+
+class LiveRangeInfo:
+    def __init__(self, id: int, start_time: int, end_time: int, size: int):
+        # Index of this live range
+        self.id = id
+        # Start time (input to the allocator algorithm)
+        self.start_time = start_time
+        # End time, inclusive (input to the allocator algorithm)
+        self.end_time = end_time
+        # Size in bytes (input to the allocator algorithm)
+        self.size = size
+        # Allocated address (the main output from the allocator algorithm)
+        self.address: int = 0
+        # End address, exclusive
+        self.end_address: int = 0
+        # id of predecessor live range (predecessor's end address == this lr's address)
+        self.predecessor: int = 0
+        # Turn at which the live range was allocated
+        self.turn: int = 0
+        # Max value of size_at_time (only used in the heuristic allocation)
+        self.urgency = 0
+        self.neighbours: List["LiveRangeInfo"] = []
+
+    def overlaps(self, addr2: int, size2: int) -> int:
+        return self.address < addr2 + size2 and addr2 < self.end_address
+
+    def is_neighbour(self, lr: "LiveRangeInfo") -> bool:
+        return self.start_time <= lr.end_time and lr.start_time <= self.end_time
+
+    def __str__(self):
+        return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(
+            self.id, self.start_time, self.end_time, self.size, self.address
+        )
+
+    def __lt__(self, other) -> bool:
+        if self.urgency != other.urgency:
+            return self.urgency > other.urgency
+        duration1 = self.end_time - self.start_time
+        duration2 = other.end_time - other.start_time
+        if duration1 != duration2:
+            return duration1 > duration2
+
+        if self.start_time != other.start_time:
+            return self.start_time < other.start_time
+
+        if self.size != other.size:
+            return self.size > other.size
+
+        return self.id < other.id
+
+
+class HillClimbAllocator:
+    """
+    Implements tensor allocator using a hill climb search.
+
+    The basic algorithm is:
+
+    Use a heuristic allocator to find an initial allocation
+    while allocation is not optimal and iterations < MAX_ITERATIONS:
+        find the "bottleneck": the live range with highest end address
+        find all live ranges that affected the allocation of the bottleneck
+        swap the order of any two affecting live ranges
+        reallocate tensors using the reordered live ranges
+        if the new allocation is better: keep it, else set allocation to previous allocation
+    """
+
+    MAX_ITERATIONS = 500
+    NOT_ALLOCATED = -1
+    # Used for live ranges allocated at address 0
+    NO_PREDECESSOR = -1
+
+    def __init__(self, live_ranges: List[LiveRange]):
+        # Contains the live ranges
+        self.lrs: List[LiveRangeInfo] = [
+            LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size) for id, lr in enumerate(live_ranges)
+        ]
+        self.lrs_at_time = []
+        # The available size (input to algorithm).
+        self.available_size: int = 0
+        # The algorithm stops once the target size has been achieved
+        self.target_size: int = 0
+        # The highest end address of the best found allocation
+        self.best_size: int = 1 << 63
+        # For each live range: max value of size_at_time (only used in the heuristic allocation)
+        self.lr_urgency = len(self.lrs) * [0]
+        nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)
+        # Contains active live ranges at each timestamp
+        self.lrs_at_time = [[] for i in range(nr_time_slots)]
+        for lr in self.lrs:
+            for t in range(lr.start_time, lr.end_time + 1):
+                self.lrs_at_time[t].append(lr)
+        # At each timestamp: accumulated size of active live ranges
+        size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]
+        # The minimum possible size, assuming all live ranges can be perfectly allocated
+        self.min_required_size: int = max(size_at_time)
+        # Calculate all neighbours + the urgency of each live range
+        for lr in self.lrs:
+            lr.urgency = 0
+            lr.neighbours = []
+            neighbours = set()
+            for t in range(lr.start_time, lr.end_time + 1):
+                lr.urgency = max(size_at_time[t], lr.urgency)
+                for lr2 in self.lrs_at_time[t]:
+                    if lr2 not in neighbours and lr != lr2:
+                        neighbours.add(lr2)
+                        lr.neighbours.append(lr2)
+
+    def allocate_lr(self, lr: LiveRangeInfo):
+        """
+        Allocates the given live range at the smallest possible address
+        """
+        address = 0
+        predecessor = HillClimbAllocator.NO_PREDECESSOR
+        fits = False
+        while not fits:
+            fits = True
+            # Find neighbours that overlap with address
+            for lr2 in lr.neighbours:
+                if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:
+                    continue
+                if lr2.overlaps(address, lr.size):
+                    # Overlap found; increase the address
+                    fits = False
+                    address = lr2.end_address
+                    predecessor = lr2.id
+        lr.address = address
+        lr.end_address = address + lr.size
+        lr.predecessor = predecessor
+
+    def allocate_indices(self, indices: List[int]):
+        """
+        Allocates the live ranges in the order indicated by the indices;
+        allocates each live range at the lowest possible address.
+        """
+        for lr in self.lrs:
+            lr.address = HillClimbAllocator.NOT_ALLOCATED
+        size = 0
+        for turn, index in enumerate(indices):
+            lr = self.lrs[index]
+            self.allocate_lr(lr)
+            lr.turn = turn
+            size = max(size, lr.end_address)
+            if size > self.best_size:
+                # This allocation is worse than the best known allocation;
+                # no need to continue
+                break
+        return size
+
+    def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):
+        """
+        Adds the given live range + its predecessors to the turn list.
+        Note: the turn_set is for quick detection of duplicates,
+        the turn_list is to get reproducible results
+        """
+        if lr.turn not in turn_set:
+            turn_set.add(lr.turn)
+            turn_list.append(lr.turn)
+        id = lr.id
+        while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:
+            id = self.lrs[id].predecessor
+            turn = self.lrs[id].turn
+            if turn not in turn_set:
+                turn_set.add(turn)
+                turn_list.append(turn)
+
+    def attempt_bottleneck_fix(self, indices: List[int]):
+        """
+        Finds the "bottleneck", the live range with highest end address, and reorders the indices
+        such that the next allocation might lower the memory usage.
+
+                                  ---------
+                                  |       |
+                                  |   D   |
+                                  |       |
+         ----------------------------------
+         |           B                 |
+         -------------------------------
+         | |
+         |A|                      ---
+         | |                      |C|
+         | |                      | |
+         ---------------------------------------
+
+        In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
+        The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
+        and all direct and indirect predecessors of D and its neighbours
+        (i.e. A, which is the predecessor of B, and indirect predecessor of D).
+
+        By permuting the order in which the affecting live ranges are allocated, the bottleneck might
+        be lowered. In the above example, almost any permutation would lower the bottleneck.
+        """
+        # Find the bottleneck
+        max_lr = self.lrs[0]
+        for lr in self.lrs[1:]:
+            if lr.end_address > max_lr.end_address:
+                max_lr = lr
+
+        # Find all live ranges that affected the placement of the bottleneck live range.
+        # This consists of two types of live ranges:
+        # - direct neighbours of the bottleneck live range
+        # - direct and indirect predecessors of these neighbours + bottleneck
+        # The turns at which these live ranges were allocated are put in the turns set.
+        turn_set = set()
+        turn_list = list()
+        self.add_predecessor_turns(turn_set, turn_list, max_lr)
+        for lr2 in max_lr.neighbours:
+            self.add_predecessor_turns(turn_set, turn_list, lr2)
+
+        # Non-direct neighbours that interfere with the allocation of the bottleneck are the
+        # immediate cause for gaps in the allocation, and are selected with higher probability.
+        non_nb_turn_list = []
+        for turn in turn_list:
+            lr = self.lrs[indices[turn]]
+            if not max_lr.is_neighbour(lr):
+                non_nb_turn_list.append(turn)
+        assert turn_list
+        # Pick from non-neighbour list with 30% probability
+        # (magic number based on tuning)
+        if random.randint(0, 100) < 30 and non_nb_turn_list:
+            # Pick a live range from the "non-neighbour list"
+            ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]
+        else:
+            # Pick any affecting live range.
+            ix1 = turn_list[random.randint(0, len(turn_list) - 1)]
+
+        ix2 = turn_list[random.randint(0, len(turn_list) - 2)]
+        if ix1 == ix2:
+            ix2 = turn_list[-1]
+        # Swap indices
+        indices[ix1], indices[ix2] = indices[ix2], indices[ix1]
+
+    def search(self, indices: List[int], iterations: int):
+        """
+        Search for a solution, using the given indices as initial solution.
+        """
+        best_indices = indices[:]
+        for _ in range(iterations):
+            # Reorder the indices
+            self.attempt_bottleneck_fix(indices)
+            # Allocate the reordered indices and check if it gave an improvement
+            new_size = self.allocate_indices(indices)
+            if new_size <= self.best_size:
+                # The new allocation produced a new best result; remember it
+                self.best_size = new_size
+                best_indices = indices[:]
+                self.allocated_addresses = [lr.address for lr in self.lrs]
+                if self.best_size <= self.min_required_size:
+                    # Target reached; stop searching
+                    return
+            else:
+                # The new allocation produced a worse result; undo the change
+                indices = best_indices[:]
+
+    def allocate(self) -> List[int]:
+        """
+        Runs the allocation algorithm. Finishes when an optimal solution has been
+        found or when maximum iterations have been run.
+        The allocated addresses are placed in the output list, in the same
+        order as the input list.
+
+        Implementation note: the algorithm produces reproducible results by using
+        a well-defined random number generator with well-defined default seed,
+        and using a fixed number of iterations.
+        """
+        random.seed(1)
+        # Sort indices on priority. Note: self.lrs must be left unchanged
+        indices = [lr.id for lr in sorted(self.lrs)]
+        # Allocate the initial solution
+        self.best_size = self.allocate_indices(indices)
+        self.allocated_addresses = [lr.address for lr in self.lrs]
+        if self.best_size > self.min_required_size:
+            # Try to improve the heuristic allocation
+            self.search(indices, HillClimbAllocator.MAX_ITERATIONS)
+        # else the heuristic allocation returned an optimal solution; no search needed
+        return self.allocated_addresses
+
+
+def allocate_live_ranges(lrs: List[LiveRange]) -> List[int]:
+    """
+    Allocates live ranges using a hill-climb search based allocator.
+    Returns the list of allocated addresses (one for each live range)
+    """
+    if not lrs:
+        return []
+    allocator = HillClimbAllocator(lrs)
+    return allocator.allocate()
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index db878bc..c45d0e3 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -38,7 +38,7 @@
 class TensorAllocator(enum.Enum):
     LinearAlloc = 1
     Greedy = 2
-    Search = 3
+    HillClimb = 3
 
     def __str__(self):
         return self.name
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 0a7da5d..b7057f0 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -20,6 +20,7 @@
 
 import numpy as np
 
+from . import hillclimb_allocation
 from . import live_range
 from . import numeric_util
 from .errors import AllocationError
@@ -30,7 +31,6 @@
 from .tensor import MemType
 from .tensor import Tensor
 from .tensor import TensorPurpose
-from ethosu import tensor_allocator
 
 
 def linear_allocate_live_ranges(live_ranges, alloc_granularity=Tensor.AllocationQuantum):
@@ -63,22 +63,14 @@
     return total_sz
 
 
-def search_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
-    # Allocates using the search-based allocator (implemented in C++)
-    input = []
-    lrs = []
-    lr_set = set()
-    for lr in live_ranges.ranges.values():
-        lr_set.add((lr.start_time, lr.end_time, lr))
-    lr_list = sorted(lr_set)
-    # Create a single array of ints containing start/end/size of the live ranges
-    for start, end, lr in lr_list:
-        input += [start, end, numeric_util.round_up(lr.size, alloc_granularity)]
-        lrs.append(lr)
-    addresses = tensor_allocator.allocate(input, 0)
+def hillclimb_allocate_live_ranges(live_ranges: LiveRangeGraph, alloc_granularity: int) -> int:
+    # Allocates using the hill climb allocator
+    lr_set = {(lr.start_time, lr.end_time, lr) for lr in live_ranges.ranges.values()}
+    lr_list = [lr for _, _, lr in lr_set]
+    addresses = hillclimb_allocation.allocate_live_ranges(lr_list)
     # The result is a list containing the allocated addresses
     total_sz = 0
-    for lr, address in zip(lrs, addresses):
+    for lr, address in zip(lr_list, addresses):
         total_sz = max(total_sz, address + lr.size)
         lr.set_address(address)
     verify_allocation(live_ranges, alloc_granularity)
@@ -179,8 +171,8 @@
             verify_allocation(lrs, cpu_tensor_alignment)
         elif tens_alloc == TensorAllocator.LinearAlloc:
             total_sz = linear_allocate_live_ranges(lrs, cpu_tensor_alignment)
-        elif tens_alloc == TensorAllocator.Search:
-            total_sz = search_allocate_live_ranges(lrs, cpu_tensor_alignment)
+        elif tens_alloc == TensorAllocator.HillClimb:
+            total_sz = hillclimb_allocate_live_ranges(lrs, cpu_tensor_alignment)
         else:
             assert 0
         alloc_ok = max_size is None or total_sz <= max_size
diff --git a/ethosu/tensor_allocator/test/test_tensor_allocator.py b/ethosu/vela/test/test_hillclimb_allocation.py
similarity index 67%
rename from ethosu/tensor_allocator/test/test_tensor_allocator.py
rename to ethosu/vela/test/test_hillclimb_allocation.py
index 1011279..8a56c3f 100644
--- a/ethosu/tensor_allocator/test/test_tensor_allocator.py
+++ b/ethosu/vela/test/test_hillclimb_allocation.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
@@ -15,10 +15,12 @@
 # limitations under the License.
 #
 # Description:
-# Unit tests for tensor_allocator.
+# Unit tests for hillclimb_allocator.
 import pytest
 
-from ethosu import tensor_allocator
+from ethosu.vela.hillclimb_allocation import allocate_live_ranges
+from ethosu.vela.live_range import LiveRange
+
 
 test_data = [
     ([(0, 100, 8000), (0, 1, 8016), (100, 110, 2000), (108, 110, 4000), (109, 110, 6000)], 16016),
@@ -40,24 +42,22 @@
 ]
 
 
+def live_range(start_time, end_time, size):
+    lr = LiveRange(None, 1)
+    lr.start_time = start_time
+    lr.end_time = end_time
+    lr.size = size
+    return lr
+
+
 @pytest.mark.parametrize("lrs, expected_size", test_data)
 def test_allocate(lrs, expected_size):
+    """Tests the hill climb allocator"""
-    input = [x for lr in lrs for x in lr]
-    res = tensor_allocator.allocate(input, 0)
+    lr_list = [live_range(start, end, size) for start, end, size in lrs]
+    res = allocate_live_ranges(lr_list)
     assert len(res) == len(lrs)
     assert max(addr + lr[2] for addr, lr in zip(res, lrs)) == expected_size
 
 
 def test_allocate_empty_input():
-    assert [] == tensor_allocator.allocate([], 0)
-
-
-invalid_input_test_data = [None, 3, [1, 2, 16, 9, 15], [1, 5, None], [-1, 0, 16], [1, 2, 10000000000]]
-
-
-@pytest.mark.parametrize("input", invalid_input_test_data)
-def test_allocate_invalid_input(input):
-    """Tests the search allocator with invalid input data"""
-    with pytest.raises(Exception):
-        tensor_allocator.allocate(input, 0)
+    assert [] == allocate_live_ranges([])
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index c4510b1..7271002 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -317,7 +317,7 @@
     )
     parser.add_argument(
         "--tensor-allocator",
-        default=TensorAllocator.Search,
+        default=TensorAllocator.HillClimb,
         type=lambda s: TensorAllocator[s],
         choices=list(TensorAllocator),
         help="Tensor Allocator algorithm (default: %(default)s)",
diff --git a/setup.py b/setup.py
index f67d42e..98dfe52 100644
--- a/setup.py
+++ b/setup.py
@@ -44,12 +44,6 @@
     ["ethosu/mlw_codec/mlw_encode.c", "ethosu/mlw_codec/mlw_decode.c", "ethosu/mlw_codec/mlw_codecmodule.c"],
 )
 
-tensor_allocator_module = Extension(
-    "ethosu.tensor_allocator",
-    ["ethosu/tensor_allocator/tensor_allocatormodule.cpp", "ethosu/tensor_allocator/search_allocator.cpp"],
-    depends=["ethosu/tensor_allocator/search_allocator.h"],
-)
-
 setup(
     name="ethos-u-vela",
     use_scm_version=True,
@@ -66,7 +60,6 @@
         "Operating System :: POSIX :: Linux",
         "Operating System :: Microsoft :: Windows",
         "Programming Language :: C",
-        "Programming Language :: C++ :: C++11",
         "Programming Language :: Python :: 3",
         "Programming Language :: Python :: 3.6",
         "Programming Language :: Python :: 3.7",
@@ -84,6 +77,6 @@
         "lxml>=4.5.1",
     ],
     entry_points={"console_scripts": ["vela = ethosu.vela.vela:main"]},
-    ext_modules=[mlw_module, tensor_allocator_module],
+    ext_modules=[mlw_module],
     setup_requires=["setuptools_scm"],
 )