ethosu/tensor_allocator/search_allocator.cpp - ml/ethos-u/ethos-u-vela - Gitiles

 /*
  * Copyright (c) 2020 Arm Limited. All rights reserved.
  *
  * SPDX-License-Identifier: Apache-2.0
  *
  * Licensed under the Apache License, Version 2.0 (the License); you may
  * not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  * www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an AS IS BASIS, WITHOUT
  * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  *
  * Description:
  * Implementation of the search-based allocator.
  */

 #include <algorithm>
 #include <cstdint>
 #include <set>
 #include <vector>

 #include "search_allocator.h"

 SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
     lrs = live_ranges;
     uint32_t max_end_time = 0;
     for (size_t i = 0; i < lrs.size(); ++i) {
         auto &lr = lrs[i];
         lr.id = static_cast<int>(i);
         max_end_time = std::max(max_end_time, lr.end_time);
     }
     lrs_at_time.resize(max_end_time + 1);
     size_at_time.resize(max_end_time + 1);
     neighbours.resize(lrs.size());
     // Calculate which live ranges are active at every timestamp
     for (size_t t = 0; t <= max_end_time; ++t) {
         lrs_at_time[t].clear();
     }
     for (auto &lr : lrs) {
         for (auto t = lr.start_time; t <= lr.end_time; ++t) {
             lrs_at_time[t].push_back(&lr);
         }
     }
     min_required_size = 0;
     for (size_t t = 0; t <= max_end_time; ++t) {
         // Calculate minimum needed size at each timestamp
         uint32_t size_at_t = 0;
         for (auto &lr : lrs_at_time[t]) {
             size_at_t += lr->size;
         }
         size_at_time[t] = size_at_t;
         min_required_size = std::max(size_at_t, min_required_size);
         // Calculate all neighbours
         for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
             auto lr1 = lrs_at_time[t][i];
             auto &nb1 = neighbours[lr1->id];
             for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
                 auto lr2 = lrs_at_time[t][j];
                 if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
                     nb1.push_back(lr2);
                     neighbours[lr2->id].push_back(lr1);
                 }
             }
         }
     }
     target_size = std::max(min_required_size, size_limit);
     // Calculate the urgency of each live range
     lr_urgency.resize(lrs.size());
     for (size_t i = 0; i < lrs.size(); ++i) {
         auto &lr = lrs[i];
         uint32_t urgency = 0;
         for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
             urgency = std::max(size_at_time[t], urgency);
         }
         lr_urgency[i] = urgency;
     }
     best_size = UINT32_MAX;
 }

 uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
     output.clear();
     nr_iterations = 0;
     std::vector<size_t> indices;
     // Initial solution, using a heuristic allocator
     for (size_t i = 0; i < lrs.size(); ++i) {
         indices.push_back(i);
     }
     sort_indices_on_prio(indices);
     // Allocate the initial solution
     best_size = UINT32_MAX;
     best_size = allocate_indices(indices);
     if (best_size <= target_size) {
         // The heuristic allocation returned an optimal solution.
         // No need to search.
     } else {
         // Try to improve the heuristic allocation
         search(indices, best_size, MAX_ITERATIONS);
     }
     output.clear();
     for (auto &lr : lrs) {
         output.push_back(lr.address);
     }
     return best_size;
 }

 void SearchAllocator::allocate_lr(LiveRange &lr) const {
     uint32_t address = 0;
     int predecessor = NO_PREDECESSOR;
     bool fits = false;
     while (!fits) {
         fits = true;
         // Find neighbours that overlap with address
         for (auto lr2_p : neighbours[lr.id]) {
             if (lr2_p->address == NOT_ALLOCATED || lr2_p->end_address <= address) {
                 continue;
             }
             if (lr2_p->overlaps(address, lr.size)) {
                 // Overlap found; increase address
                 fits = false;
                 address = lr2_p->end_address;
                 predecessor = lr2_p->id;
             }
         }
     }
     lr.address = address;
     lr.end_address = address + lr.size;
     lr.predecessor = predecessor;
 }

 uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
     ++nr_iterations;
     std::vector<size_t> count(indices.size());
     for (auto &lr : lrs) {
         lr.address = NOT_ALLOCATED;
     }
     uint32_t size = 0;
     for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
         auto &lr = lrs[indices[turn]];
         allocate_lr(lr);
         lr.turn = turn;
         size = std::max(size, lr.end_address);
     }
     return size;
 }

 void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
     std::sort(indices.begin(), indices.end(),
         [this] (size_t const& a, size_t const& b) {
             if (lr_urgency[a] != lr_urgency[b]) {
                 return lr_urgency[a] > lr_urgency[b];
             }
             auto &lr1 = lrs[a];
             auto &lr2 = lrs[b];
             auto duration1 = lr1.end_time - lr1.start_time;
             auto duration2 = lr2.end_time - lr2.start_time;
             if (duration1 != duration2) {
                 return duration1 > duration2;
             }
             if (lr1.start_time != lr2.start_time) {
                 return lr1.start_time < lr2.start_time;
             }
             if (lr1.size != lr2.size) {
                 return lr1.size > lr2.size;
             }
             return lr1.id < lr2.id;
         });
 }

 void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
     turns.insert(lr.turn);
     int id = lr.id;
     while (lrs[id].predecessor != NO_PREDECESSOR) {
         id = lrs[id].predecessor;
         turns.insert(lrs[id].turn);
     }
 }

 void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
     // Find the bottleneck
     LiveRange *max_lr = &lrs[0];
     for (auto &lr : lrs) {
         if (lr.end_address > max_lr->end_address) {
             max_lr = &lr;
         }
     }
     // Find all live ranges that affected the placement of the bottleneck live range.
     // This consists of two types of live ranges:
     // - direct neighbours of the bottleneck live range
     // - direct and indirect predecessors of these neighbours + bottleneck
     // The turns at which these live ranges were allocated are put in the turns vector.
     std::set<size_t> turns;
     add_predecessor_turns(turns, *max_lr);
     for (auto lr_p : neighbours[max_lr->id]) {
         add_predecessor_turns(turns, *lr_p);
     }
     // Non-direct neighbours that interfere with the allocation of the bottleneck are the
     // immediate cause for gaps in the allocation, and are selected with higher probability.
     std::vector<size_t> turn_list;
     std::vector<size_t> non_nb_turn_list;
     for (auto turn : turns) {
         turn_list.push_back(turn);
         auto &lr = lrs[indices[turn]];
         if (!max_lr->is_neighbour(lr)) {
             non_nb_turn_list.push_back(turn);
         }
     }
     size_t ix1;
     if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
         // Pick a live range from the "non-neighbour list"
         ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
     } else {
         // Pick any affecting live range.
         ix1 = turn_list[rng() % turn_list.size()];
     }
     // Note: turn_list has always at least 2 elements for bottlenecks
     size_t ix2 = turn_list[rng() % (turn_list.size() - 1)];
     if (ix1 == ix2) {
         ix2 = turn_list[turn_list.size() - 1];
     }
     // Swap indices
     std::swap(indices[ix1], indices[ix2]);
 }

 void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
     std::vector<size_t> best_indices = indices;
     std::vector<LiveRange> best_lrs = lrs;
     for (int i = 0; i < iterations; ++i) {
         // Reorder the indices
         attempt_bottleneck_fix(indices);
         // Allocate the reordered indices and check if it gave an improvement
         auto new_size = allocate_indices(indices);
         if (new_size <= best_size) {
             // The new allocation produced a new best result; remember it
             best_size = new_size;
             best_indices = indices;
             best_lrs = lrs;
             if (best_size <= target_size) {
                 // Target reached; stop
                 return;
             }
         } else {
             // The new allocation produced worse result; undo the change
             indices = best_indices;
             lrs = best_lrs;
         }
     }
     lrs = best_lrs;
 }

 uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
     // Convert input to vector of live ranges
     std::vector<LiveRange> lrs;
     for (size_t i = 0; i < input.size(); i += 3) {
         LiveRange lr;
         lr.start_time = input[i];
         lr.end_time = input[i+1];
         lr.size = input[i+2];
         lrs.push_back(lr);
     }
     SearchAllocator allocator(lrs, available_size);
     return allocator.allocate(output);
 }
	/*
	* Copyright (c) 2020 Arm Limited. All rights reserved.
	*
	* SPDX-License-Identifier: Apache-2.0
	*
	* Licensed under the Apache License, Version 2.0 (the License); you may
	* not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*
	* Description:
	* Implementation of the search-based allocator.
	*/

	#include <algorithm>
	#include <cstdint>
	#include <set>
	#include <vector>

	#include "search_allocator.h"

	SearchAllocator::SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit) {
	lrs = live_ranges;
	uint32_t max_end_time = 0;
	for (size_t i = 0; i < lrs.size(); ++i) {
	auto &lr = lrs[i];
	lr.id = static_cast<int>(i);
	max_end_time = std::max(max_end_time, lr.end_time);
	}
	lrs_at_time.resize(max_end_time + 1);
	size_at_time.resize(max_end_time + 1);
	neighbours.resize(lrs.size());
	// Calculate which live ranges are active at every timestamp
	for (size_t t = 0; t <= max_end_time; ++t) {
	lrs_at_time[t].clear();
	}
	for (auto &lr : lrs) {
	for (auto t = lr.start_time; t <= lr.end_time; ++t) {
	lrs_at_time[t].push_back(&lr);
	}
	}
	min_required_size = 0;
	for (size_t t = 0; t <= max_end_time; ++t) {
	// Calculate minimum needed size at each timestamp
	uint32_t size_at_t = 0;
	for (auto &lr : lrs_at_time[t]) {
	size_at_t += lr->size;
	}
	size_at_time[t] = size_at_t;
	min_required_size = std::max(size_at_t, min_required_size);
	// Calculate all neighbours
	for (size_t i = 0; i < lrs_at_time[t].size(); ++i) {
	auto lr1 = lrs_at_time[t][i];
	auto &nb1 = neighbours[lr1->id];
	for (size_t j = i + 1; j < lrs_at_time[t].size(); ++j) {
	auto lr2 = lrs_at_time[t][j];
	if (find(nb1.begin(), nb1.end(), lr2) == nb1.end()) {
	nb1.push_back(lr2);
	neighbours[lr2->id].push_back(lr1);
	}
	}
	}
	}
	target_size = std::max(min_required_size, size_limit);
	// Calculate the urgency of each live range
	lr_urgency.resize(lrs.size());
	for (size_t i = 0; i < lrs.size(); ++i) {
	auto &lr = lrs[i];
	uint32_t urgency = 0;
	for (size_t t = lr.start_time; t <= lr.end_time; ++t) {
	urgency = std::max(size_at_time[t], urgency);
	}
	lr_urgency[i] = urgency;
	}
	best_size = UINT32_MAX;
	}

	uint32_t SearchAllocator::allocate(std::vector<uint32_t> &output) {
	output.clear();
	nr_iterations = 0;
	std::vector<size_t> indices;
	// Initial solution, using a heuristic allocator
	for (size_t i = 0; i < lrs.size(); ++i) {
	indices.push_back(i);
	}
	sort_indices_on_prio(indices);
	// Allocate the initial solution
	best_size = UINT32_MAX;
	best_size = allocate_indices(indices);
	if (best_size <= target_size) {
	// The heuristic allocation returned an optimal solution.
	// No need to search.
	} else {
	// Try to improve the heuristic allocation
	search(indices, best_size, MAX_ITERATIONS);
	}
	output.clear();
	for (auto &lr : lrs) {
	output.push_back(lr.address);
	}
	return best_size;
	}

	void SearchAllocator::allocate_lr(LiveRange &lr) const {
	uint32_t address = 0;
	int predecessor = NO_PREDECESSOR;
	bool fits = false;
	while (!fits) {
	fits = true;
	// Find neighbours that overlap with address
	for (auto lr2_p : neighbours[lr.id]) {
	if (lr2_p->address == NOT_ALLOCATED \|\| lr2_p->end_address <= address) {
	continue;
	}
	if (lr2_p->overlaps(address, lr.size)) {
	// Overlap found; increase address
	fits = false;
	address = lr2_p->end_address;
	predecessor = lr2_p->id;
	}
	}
	}
	lr.address = address;
	lr.end_address = address + lr.size;
	lr.predecessor = predecessor;
	}

	uint32_t SearchAllocator::allocate_indices(const std::vector<size_t> &indices) {
	++nr_iterations;
	std::vector<size_t> count(indices.size());
	for (auto &lr : lrs) {
	lr.address = NOT_ALLOCATED;
	}
	uint32_t size = 0;
	for (size_t turn = 0; size <= best_size && turn < indices.size(); ++turn) {
	auto &lr = lrs[indices[turn]];
	allocate_lr(lr);
	lr.turn = turn;
	size = std::max(size, lr.end_address);
	}
	return size;
	}

	void SearchAllocator::sort_indices_on_prio(std::vector<size_t> &indices) const {
	std::sort(indices.begin(), indices.end(),
	[this] (size_t const& a, size_t const& b) {
	if (lr_urgency[a] != lr_urgency[b]) {
	return lr_urgency[a] > lr_urgency[b];
	}
	auto &lr1 = lrs[a];
	auto &lr2 = lrs[b];
	auto duration1 = lr1.end_time - lr1.start_time;
	auto duration2 = lr2.end_time - lr2.start_time;
	if (duration1 != duration2) {
	return duration1 > duration2;
	}
	if (lr1.start_time != lr2.start_time) {
	return lr1.start_time < lr2.start_time;
	}
	if (lr1.size != lr2.size) {
	return lr1.size > lr2.size;
	}
	return lr1.id < lr2.id;
	});
	}

	void SearchAllocator::add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const {
	turns.insert(lr.turn);
	int id = lr.id;
	while (lrs[id].predecessor != NO_PREDECESSOR) {
	id = lrs[id].predecessor;
	turns.insert(lrs[id].turn);
	}
	}

	void SearchAllocator::attempt_bottleneck_fix(std::vector<size_t> &indices) {
	// Find the bottleneck
	LiveRange *max_lr = &lrs[0];
	for (auto &lr : lrs) {
	if (lr.end_address > max_lr->end_address) {
	max_lr = &lr;
	}
	}
	// Find all live ranges that affected the placement of the bottleneck live range.
	// This consists of two types of live ranges:
	// - direct neighbours of the bottleneck live range
	// - direct and indirect predecessors of these neighbours + bottleneck
	// The turns at which these live ranges were allocated are put in the turns vector.
	std::set<size_t> turns;
	add_predecessor_turns(turns, *max_lr);
	for (auto lr_p : neighbours[max_lr->id]) {
	add_predecessor_turns(turns, *lr_p);
	}
	// Non-direct neighbours that interfere with the allocation of the bottleneck are the
	// immediate cause for gaps in the allocation, and are selected with higher probability.
	std::vector<size_t> turn_list;
	std::vector<size_t> non_nb_turn_list;
	for (auto turn : turns) {
	turn_list.push_back(turn);
	auto &lr = lrs[indices[turn]];
	if (!max_lr->is_neighbour(lr)) {
	non_nb_turn_list.push_back(turn);
	}
	}
	size_t ix1;
	if (rng() % 100 < 30 && !non_nb_turn_list.empty()) {
	// Pick a live range from the "non-neighbour list"
	ix1 = non_nb_turn_list[rng() % non_nb_turn_list.size()];
	} else {
	// Pick any affecting live range.
	ix1 = turn_list[rng() % turn_list.size()];
	}
	// Note: turn_list has always at least 2 elements for bottlenecks
	size_t ix2 = turn_list[rng() % (turn_list.size() - 1)];
	if (ix1 == ix2) {
	ix2 = turn_list[turn_list.size() - 1];
	}
	// Swap indices
	std::swap(indices[ix1], indices[ix2]);
	}

	void SearchAllocator::search(std::vector<size_t> &indices, uint32_t initial_size, int iterations) {
	std::vector<size_t> best_indices = indices;
	std::vector<LiveRange> best_lrs = lrs;
	for (int i = 0; i < iterations; ++i) {
	// Reorder the indices
	attempt_bottleneck_fix(indices);
	// Allocate the reordered indices and check if it gave an improvement
	auto new_size = allocate_indices(indices);
	if (new_size <= best_size) {
	// The new allocation produced a new best result; remember it
	best_size = new_size;
	best_indices = indices;
	best_lrs = lrs;
	if (best_size <= target_size) {
	// Target reached; stop
	return;
	}
	} else {
	// The new allocation produced worse result; undo the change
	indices = best_indices;
	lrs = best_lrs;
	}
	}
	lrs = best_lrs;
	}

	uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output) {
	// Convert input to vector of live ranges
	std::vector<LiveRange> lrs;
	for (size_t i = 0; i < input.size(); i += 3) {
	LiveRange lr;
	lr.start_time = input[i];
	lr.end_time = input[i+1];
	lr.size = input[i+2];
	lrs.push_back(lr);
	}
	SearchAllocator allocator(lrs, available_size);
	return allocator.allocate(output);
	}