Blame - ethosu/tensor_allocator/search_allocator.h - ml/ethos-u/ethos-u-vela

blob: 6c750151d9c64d7c73da77f6240b80c341f2638c [file] [log] [blame]

Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	1	/*
				2	* Copyright (c) 2020 Arm Limited. All rights reserved.
				3	*
				4	* SPDX-License-Identifier: Apache-2.0
				5	*
				6	* Licensed under the Apache License, Version 2.0 (the License); you may
				7	* not use this file except in compliance with the License.
				8	* You may obtain a copy of the License at
				9	*
				10	* www.apache.org/licenses/LICENSE-2.0
				11	*
				12	* Unless required by applicable law or agreed to in writing, software
				13	* distributed under the License is distributed on an AS IS BASIS, WITHOUT
				14	* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				15	* See the License for the specific language governing permissions and
				16	* limitations under the License.
				17	*
				18	* Description:
				19	* Declaration of the search-based allocator.
				20	*/
				21
				22	#ifndef __SEARCH_ALLOCATOR_H
				23	#define __SEARCH_ALLOCATOR_H
				24
				25	#include <algorithm>
				26	#include <cstdint>
				27	#include <random>
				28	#include <set>
				29	#include <vector>
				30
				31	/**
				32	* Live range
				33	*/
				34	struct LiveRange {
				35	/** Start time (input to the allocator algorithm) */
				36	uint32_t start_time;
				37	/** End time, inclusive (input to the allocator algorithm) */
				38	uint32_t end_time;
				39	/** Size in bytes (input to the allocator algorithm) */
				40	uint32_t size;
				41	/** Index of this live range */
				42	int id;
				43	/** Allocated address (the main output from the allocator algorithm) */
				44	uint32_t address;
				45	/** End address, exclusive */
				46	uint32_t end_address;
				47	/** id of predecessor live range (predecessor's end address == this lr's address) */
				48	int predecessor;
				49	/** Turn at which the live range was allocated */
				50	size_t turn;
				51
				52	bool overlaps(uint32_t addr2, uint32_t size2) const {
				53	return address < addr2 + size2 && addr2 < end_address;
				54	}
				55	bool is_neighbour(const LiveRange &lr) const {
				56	return start_time <= lr.end_time && lr.start_time <= end_time;
				57	}
				58	};
				59
				60	/**
				61	* Implements tensor allocator using state space exploration.
				62	*
				63	* The basic algorithm is:
				64	*
				65	* Use a heuristic allocator to find an initial allocation
				66	* while allocation is not optimal and iterations < MAX_ITERATIONS {
				67	* find the "bottleneck": the live range with highest end address
				68	* find all live ranges that affected the allocation of the bottleneck
				69	* swap the order of any two affecting live ranges
				70	* reallocate tensors using the reordered live ranges
				71	* if the new allocation is better: keep it, else set allocation to previous allocation
				72	* }
				73	*/
				74	class SearchAllocator {
				75	private:
				76	static constexpr int MAX_ITERATIONS = 500;
				77	static constexpr uint32_t NOT_ALLOCATED = UINT32_MAX;
				78	/** Used for live ranges allocated at address 0 */
				79	static constexpr int NO_PREDECESSOR = -1;
				80	/** Contains the live ranges */
				81	std::vector<LiveRange> lrs;
				82	/** Contains active live ranges at each timestamp */
				83	std::vector<std::vector<LiveRange*>> lrs_at_time;
				84	/**
				85	* Contains neighbours of each live range (indexed by lr.id), i.e.
				86	* live ranges with overlapping start/end time.
				87	*/
				88	std::vector<std::vector<LiveRange*>> neighbours;
				89	/**
				90	* At each timestamp: accumulated size of active live ranges
				91	*/
				92	std::vector<uint32_t> size_at_time;
				93	/**
				94	* For each live range: max value of size_at_time (only used in the heuristic allocation)
				95	*/
				96	std::vector<uint32_t> lr_urgency;
				97	/**
				98	* The minimum possible size, assuming all live ranges can be perfectly allocated
				99	*/
				100	uint32_t min_required_size;
Louis Verhaard	9bfe0f8	2020-12-03 12:26:25 +0100	[diff] [blame]	101	/** The algorithm stops once the target size has been achieved */
				102	uint32_t target_size;
				103	/** The highest end address of the best found allocation */
				104	uint32_t best_size;
				105	/** Number of performed iterations */
				106	size_t nr_iterations = 0;
				107	/** Random number generator; use default seed (which is well-defined) */
				108	std::mt19937 rng;
				109	public:
				110	SearchAllocator(const std::vector<LiveRange> &live_ranges, uint32_t size_limit);
				111	/**
				112	* Runs the allocation algorithm. Finishes when the target size has been
				113	* reached or when maximum iterations have been run.
				114	* The allocated addresses are placed in the output vector, in the same
				115	* order as the input vector.
				116	*
				117	* Implementation note: the algorithm produces reproduceable results by using
				118	* a well-defined random number generator with well-defined default seed,
				119	* and using a fixed number of iterations.
				120	*/
				121	uint32_t allocate(std::vector<uint32_t> &output);
				122	uint32_t get_min_required_size() const {
				123	return min_required_size;
				124	}
				125	size_t get_nr_iterations() const {
				126	return nr_iterations;
				127	}
				128	private:
				129	/**
				130	* Allocates the given live range at the smallest possible address
				131	*/
				132	void allocate_lr(LiveRange &lr) const;
				133	/**
				134	* Allocates the live ranges in the order indicated by the indices;
				135	* allocates each live range at the lowest possible address.
				136	*/
				137	uint32_t allocate_indices(const std::vector<size_t> &indices);
				138	/** Sorts live ranges based on heuristics, used for the initial allocation */
				139	void sort_indices_on_prio(std::vector<size_t> &indices) const;
				140	/** Adds the given live range + predecessors to the turns vector */
				141	void add_predecessor_turns(std::set<size_t> &turns, const LiveRange &lr) const;
				142	/**
				143	* Finds the "bottleneck", the live range with highest end address, and reorders the indices
				144	* such that a next allocation might lower the memory usage.
				145	*
				146	* ---------
				147	* \| \|
				148	* \| D \|
				149	* \| \|
				150	* ----------------------------------
				151	* \| B \|
				152	* -------------------------------
				153	* \| \|
				154	* \|A\| ---
				155	* \| \| \|C\|
				156	* \| \| \| \|
				157	* ---------------------------------------
				158	*
				159	* In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.
				160	* The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),
				161	* and all direct and indirect predecessors of D and its neighbours
				162	* (i.e. A, which is the predecessor of B, and indirect predecessor of D).
				163	*
				164	* By permuting the order in which the affecting live ranges are allocated, the bottleneck might
				165	* be lowered. In the above example, almost any permutation would lower the bottleneck.
				166	*
				167	* Note that there is room to improve the efficiency of the algorithm.
				168	* One way could be to first allocate all direct neighbours of the bottleneck
				169	* (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does
				170	* not actively try this, as it may lead to allocation loops (A could become the new bottle-neck);
				171	* it just uses a higher probability of selecting A.
				172	*/
				173	void attempt_bottleneck_fix(std::vector<size_t> &indices);
				174	/** Search for a solution, using the given indices as initial solution. */
				175	void search(std::vector<size_t> &indices, uint32_t initial_size, int iterations);
				176	};
				177
				178	/** Wrapper function to perform live range allocation */
				179	uint32_t allocate(const std::vector<uint32_t> &input, int available_size, std::vector<uint32_t> &output);
				180
				181	#endif // __SEARCH_ALLOCATOR_H