Blame - ethosu/vela/hillclimb_allocation.py - ml/ethos-u/ethos-u-vela

2021-01-20 17:23:54 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

16

#

17

# Description:

18

# Tensor allocator based on a hill-climb search

19

import random

20

from typing import List

21

from typing import Set

22

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

23

from . import numeric_util

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

24

from .live_range import LiveRange

25

26

27

class LiveRangeInfo:

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

28

def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

29

# Index of this live range

30

self.id = id

31

# Start time (input to the allocator algorithm)

32

self.start_time = start_time

33

# End time, inclusive (input to the allocator algorithm)

34

self.end_time = end_time

35

# Size in bytes (input to the allocator algorithm)

36

self.size = size

37

# Allocated address (the main output from the allocator algorithm)

38

self.address: int = 0

39

# End address, exclusive

40

self.end_address: int = 0

41

# id of predecessor live range (predecessor's end address == this lr's address)

42

self.predecessor: int = 0

43

# Turn at which the live range was allocated

44

self.turn: int = 0

45

# Max value of size_at_time (only used in the heuristic allocation)

46

self.urgency = 0

47

self.neighbours: List["LiveRangeInfo"] = []

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

48

self.min_alignment = min_alignment

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

49

50

def overlaps(self, addr2: int, size2: int) -> int:

51

return self.address < addr2 + size2 and addr2 < self.end_address

52

53

def is_neighbour(self, lr: "LiveRangeInfo") -> bool:

54

return self.start_time <= lr.end_time and lr.start_time <= self.end_time

55

56

def __str__(self):

57

return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(

58

self.id, self.start_time, self.end_time, self.size, self.address

59

)

60

61

def __lt__(self, other) -> bool:

62

if self.urgency != other.urgency:

63

return self.urgency > other.urgency

64

duration1 = self.end_time - self.start_time

65

duration2 = other.end_time - other.start_time

66

if duration1 != duration2:

67

return duration1 > duration2

68

69

if self.start_time != other.start_time:

70

return self.start_time < other.start_time

71

72

if self.size != other.size:

73

return self.size > other.size

74

75

return self.id < other.id

76

77

78

class HillClimbAllocator:

79

"""

80

Implements tensor allocator using a hill climb search.

81

82

The basic algorithm is:

83

84

Use a heuristic allocator to find an initial allocation

85

while allocation is not optimal and iterations < MAX_ITERATIONS:

86

find the "bottleneck": the live range with highest end address

87

find all live ranges that affected the allocation of the bottleneck

88

swap the order of any two affecting live ranges

89

reallocate tensors using the reordered live ranges

90

if the new allocation is better: keep it, else set allocation to previous allocation

91

"""

92

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

93

# Maximum number of iterations of the algorithm (can be override from the command line)

94

MAX_ITERATIONS = 99999

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

95

NOT_ALLOCATED = -1

96

# Used for live ranges allocated at address 0

97

NO_PREDECESSOR = -1

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

98

# Special handling if best solution has not improved during this many iterations

99

MAX_ITERATIONS_STUCK = 50

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

100

# Minimum number of iterations since the last improvement (unless an optimal solution is found)

101

MIN_ITERATIONS_IMPROVE = 500

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

102

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

103

def __init__(self, live_ranges: List[LiveRange], max_iterations: int, memory_limit: int):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

104

# Contains the live ranges

105

self.lrs: List[LiveRangeInfo] = [

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

106

LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())

107

for id, lr in enumerate(live_ranges)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

108

]

Jonas Ohlsson

845e232

2022-03-01 12:39:55 +0100

[diff] [blame]

109

self.lrs_at_time: List[List[LiveRangeInfo]] = []

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

110

# The available size (input to algorithm).

111

self.available_size: int = 0

112

# The algorithm stops once the target size has been achieved

113

self.target_size: int = 0

114

# The highest end address of the best found allocation

115

self.best_size: int = 1 << 63

116

# For each live range: max value of size_at_time (only used in the heuristic allocation)

117

self.lr_urgency = len(self.lrs) * [0]

118

nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)

119

# Contains active live ranges at each timestamp

120

self.lrs_at_time = [[] for i in range(nr_time_slots)]

121

for lr in self.lrs:

122

for t in range(lr.start_time, lr.end_time + 1):

123

self.lrs_at_time[t].append(lr)

124

# At each timestamp: accumulated size of active live ranges

125

size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]

126

# The minimum possible size, assuming all live ranges can be perfectly allocated

127

self.min_required_size: int = max(size_at_time)

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

128

# Set the maximum number of iterations the search will iterate to find a solution

129

if max_iterations is None:

130

self.max_iterations = self.MAX_ITERATIONS

131

else:

132

self.max_iterations = max_iterations

133

# Defines a memory limit that the allocation must meet

134

self.memory_limit = memory_limit

135

if self.memory_limit < self.min_required_size:

136

print(

137

f"Warning: Memory limit = {self.memory_limit} is less than the minimum possible allocation size ="

138

f" {self.min_required_size}"

139

)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

140

# Calculate all neighbours + the urgency of each live range

for lr in self.lrs:

lr.urgency = 0

lr.neighbours = []

neighbours = set()

for t in range(lr.start_time, lr.end_time + 1):

146

lr.urgency = max(size_at_time[t], lr.urgency)

147

for lr2 in self.lrs_at_time[t]:

148

if lr2 not in neighbours and lr != lr2:

149

neighbours.add(lr2)

150

lr.neighbours.append(lr2)

151

152

def allocate_lr(self, lr: LiveRangeInfo):

153

"""

154

Allocates the given live range at the smallest possible address

155

"""

156

address = 0

157

predecessor = HillClimbAllocator.NO_PREDECESSOR

fits = False

while not fits:

fits = True

# Find neighbours that overlap with address

162

for lr2 in lr.neighbours:

163

if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:

164

continue

165

if lr2.overlaps(address, lr.size):

166

# Overlap found increase address

167

fits = False

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

168

address = numeric_util.round_up(lr2.end_address, lr.min_alignment)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

169

predecessor = lr2.id

170

lr.address = address

171

lr.end_address = address + lr.size

172

lr.predecessor = predecessor

173

174

def allocate_indices(self, indices: List[int]):

175

"""

176

Allocates the live ranges in the order indicated by the indices;

177

allocates each live range at the lowest possible address.

178

"""

179

for lr in self.lrs:

180

lr.address = HillClimbAllocator.NOT_ALLOCATED

181

size = 0

182

for turn, index in enumerate(indices):

lr = self.lrs[index]

self.allocate_lr(lr)

lr.turn = turn

size = max(size, lr.end_address)

187

if size > self.best_size:

188

# This allocation is worse than the best known allocation;

189

# no need to continue

break

return size

def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):

194

"""

195

Adds the given live range + predecessors to the turns vector.

196

Note: the turn_set is for quick detection of duplicates,

197

the turn_list is to get reproduceable results

198

"""

199

if lr.turn not in turn_set:

200

turn_set.add(lr.turn)

201

turn_list.append(lr.turn)

202

id = lr.id

203

while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:

204

id = self.lrs[id].predecessor

205

turn = self.lrs[id].turn

206

if turn not in turn_set:

207

turn_set.add(turn)

208

turn_list.append(turn)

209

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

210

def attempt_bottleneck_fix(self, indices: List[int], iterations_stuck):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

211

"""

212

Finds the "bottleneck", the live range with highest end address, and reorders the indices

213

such that a next allocation might lower the memory usage.

---------

| |

| D |

| |

----------------------------------

220

| B |

221

-------------------------------

| |

|A| ---

| | |C|

| | | |

---------------------------------------

227

228

In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.

229

The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),

230

and all direct and indirect predecessors of D and its neighbours

231

(i.e. A, which is the predecessor of B, and indirect predecessor of D).

232

233

By permuting the order in which the affecting live ranges are allocated, the bottleneck might

234

be lowered. In the above example, almost any permutation would lower the bottleneck.

235

"""

236

# Find the bottleneck

237

max_lr = self.lrs[0]

238

for lr in self.lrs[1:]:

239

if lr.end_address > max_lr.end_address:

240

max_lr = lr

241

242

# Find all live ranges that affected the placement of the bottleneck live range.

243

# This consists of two types of live ranges:

244

# - direct neighbours of the bottleneck live range

245

# - direct and indirect predecessors of these neighbours + bottleneck

246

# The turns at which these live ranges were allocated are put in the turns set.

Jonas Ohlsson

845e232

2022-03-01 12:39:55 +0100

[diff] [blame]

247

turn_set: Set[int] = set()

248

turn_list: List[int] = list()

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

249

self.add_predecessor_turns(turn_set, turn_list, max_lr)

250

for lr2 in max_lr.neighbours:

251

self.add_predecessor_turns(turn_set, turn_list, lr2)

252

253

# Non-direct neighbours that interfere with the allocation of the bottleneck are the

254

# immediate cause for gaps in the allocation, and are selected with higher probability.

255

non_nb_turn_list = []

256

for turn in turn_list:

257

lr = self.lrs[indices[turn]]

258

if not max_lr.is_neighbour(lr):

259

non_nb_turn_list.append(turn)

260

assert turn_list

261

# Pick from non-neighbour list with 30% probability

262

# (magic number based on tuning)

263

if random.randint(0, 100) < 30 and non_nb_turn_list:

264

# Pick a live range from the "non-neighbour list"

265

ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]

266

else:

267

# Pick any affecting live range.

268

ix1 = turn_list[random.randint(0, len(turn_list) - 1)]

269

270

ix2 = turn_list[random.randint(0, len(turn_list) - 2)]

if ix1 == ix2:

ix2 = turn_list[-1]

# Swap indices

indices[ix1], indices[ix2] = indices[ix2], indices[ix1]

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

275

if iterations_stuck > HillClimbAllocator.MAX_ITERATIONS_STUCK:

276

# The best allocation has not improved for a while, maybe improvement is not possible

277

# by single-swapping indices; add more neighbour live ranges and swap up to two more indices.

278

# Adding more neighbours can sometimes resolve the situation where the current bottleneck

279

# is resolved, but always results in a higher bottleneck at a nearby live range.

280

# Magic number is based on tuning

281

for turn in non_nb_turn_list:

282

for lr in self.lrs[indices[turn]].neighbours:

283

if lr.turn not in turn_set:

284

turn_set.add(lr.turn)

285

turn_list.append(lr.turn)

286

ix1 = turn_list[random.randint(0, len(turn_list) - 1)]

287

ix2 = turn_list[random.randint(0, len(turn_list) - 1)]

288

indices[ix1], indices[ix2] = indices[ix2], indices[ix1]

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

289

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

290

def search(self, indices: List[int]):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

291

"""

292

Search for a solution, using the given indices as initial solution.

293

"""

294

best_indices = indices[:]

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

295

last_improvement_iteration = 0

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

296

i = 0

297

while (self.best_size > self.memory_limit and i < self.max_iterations) or (

298

i - last_improvement_iteration < self.MIN_ITERATIONS_IMPROVE

299

):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

300

# Reorder the indices

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

301

self.attempt_bottleneck_fix(indices, i - last_improvement_iteration)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

302

# Allocate the reordered indices and check if it gave an improvement

303

new_size = self.allocate_indices(indices)

304

if new_size <= self.best_size:

305

# The new allocation produced a new best result remember it

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

306

if new_size < self.best_size:

307

last_improvement_iteration = i

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

308

self.best_size = new_size

309

best_indices = indices[:]

310

self.allocated_addresses = [lr.address for lr in self.lrs]

311

if self.best_size <= self.min_required_size:

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

312

# Target reached; stop

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

313

return

314

else:

315

# The new allocation produced worse result undo the change

316

indices = best_indices[:]

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

317

i += 1

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

318

319

def allocate(self) -> List[int]:

320

"""

321

Runs the allocation algorithm. Finishes when an optimal solution has been

322

found or when maximum iterations have been run.

323

The allocated addresses are placed in the output vector, in the same

324

order as the input vector.

325

326

Implementation note: the algorithm produces reproduceable results by using

327

a well-defined random number generator with well-defined default seed,

328

and using a fixed number of iterations.

329

"""

330

random.seed(1)

331

# Sort indices on priority. Note: self.lrs must be left unchanged

332

indices = [lr.id for lr in sorted(self.lrs)]

333

# Allocate the initial solution

334

self.best_size = self.allocate_indices(indices)

335

self.allocated_addresses = [lr.address for lr in self.lrs]

336

if self.best_size > self.min_required_size:

337

# Try to improve the heuristic allocation

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

338

self.search(indices)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

339

# else the heuristic allocation returned an optimal solution; no search needed

340

return self.allocated_addresses

341

342

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

343

def allocate_live_ranges(lrs: List[LiveRange], max_iterations: int, memory_limit: int) -> List[int]:

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

344

"""

345

Allocates live ranges using a search based allocator.

346

Returns the list of allocated addresses (one for each live range)

347

"""

348

if not lrs:

349

return []

Tim Hall

2022-05-19 12:36:58 +0100

[diff] [blame]

350

allocator = HillClimbAllocator(lrs, max_iterations, memory_limit)

Louis Verhaard