Blame - ethosu/vela/hillclimb_allocation.py - ml/ethos-u/ethos-u-vela

2021-01-20 17:23:54 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

16

#

17

# Description:

18

# Tensor allocator based on a hill-climb search

19

import random

20

from typing import List

21

from typing import Set

22

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

23

from . import numeric_util

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

24

from .live_range import LiveRange

25

26

27

class LiveRangeInfo:

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

28

def __init__(self, id: int, start_time: int, end_time: int, size: int, min_alignment: int):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

29

# Index of this live range

30

self.id = id

31

# Start time (input to the allocator algorithm)

32

self.start_time = start_time

33

# End time, inclusive (input to the allocator algorithm)

34

self.end_time = end_time

35

# Size in bytes (input to the allocator algorithm)

36

self.size = size

37

# Allocated address (the main output from the allocator algorithm)

38

self.address: int = 0

39

# End address, exclusive

40

self.end_address: int = 0

41

# id of predecessor live range (predecessor's end address == this lr's address)

42

self.predecessor: int = 0

43

# Turn at which the live range was allocated

44

self.turn: int = 0

45

# Max value of size_at_time (only used in the heuristic allocation)

46

self.urgency = 0

47

self.neighbours: List["LiveRangeInfo"] = []

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

48

self.min_alignment = min_alignment

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

49

50

def overlaps(self, addr2: int, size2: int) -> int:

51

return self.address < addr2 + size2 and addr2 < self.end_address

52

53

def is_neighbour(self, lr: "LiveRangeInfo") -> bool:

54

return self.start_time <= lr.end_time and lr.start_time <= self.end_time

55

56

def __str__(self):

57

return "<LiveRangeInfo: id={}, start_time={}, end_time={}, size={}, address={}>".format(

58

self.id, self.start_time, self.end_time, self.size, self.address

59

)

60

61

def __lt__(self, other) -> bool:

62

if self.urgency != other.urgency:

63

return self.urgency > other.urgency

64

duration1 = self.end_time - self.start_time

65

duration2 = other.end_time - other.start_time

66

if duration1 != duration2:

67

return duration1 > duration2

68

69

if self.start_time != other.start_time:

70

return self.start_time < other.start_time

71

72

if self.size != other.size:

73

return self.size > other.size

74

75

return self.id < other.id

76

77

78

class HillClimbAllocator:

79

"""

80

Implements tensor allocator using a hill climb search.

81

82

The basic algorithm is:

83

84

Use a heuristic allocator to find an initial allocation

85

while allocation is not optimal and iterations < MAX_ITERATIONS:

86

find the "bottleneck": the live range with highest end address

87

find all live ranges that affected the allocation of the bottleneck

88

swap the order of any two affecting live ranges

89

reallocate tensors using the reordered live ranges

90

if the new allocation is better: keep it, else set allocation to previous allocation

"""

MAX_ITERATIONS = 500

NOT_ALLOCATED = -1

# Used for live ranges allocated at address 0

96

NO_PREDECESSOR = -1

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

97

# Special handling if best solution has not improved during this many iterations

98

MAX_ITERATIONS_STUCK = 50

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

99

100

def __init__(self, live_ranges: List[LiveRange]):

101

# Contains the live ranges

102

self.lrs: List[LiveRangeInfo] = [

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

103

LiveRangeInfo(id, lr.start_time, lr.end_time, lr.size, lr.get_alignment())

104

for id, lr in enumerate(live_ranges)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

105

]

Jonas Ohlsson

845e232

2022-03-01 12:39:55 +0100

[diff] [blame]

106

self.lrs_at_time: List[List[LiveRangeInfo]] = []

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

107

# The available size (input to algorithm).

108

self.available_size: int = 0

109

# The algorithm stops once the target size has been achieved

110

self.target_size: int = 0

111

# The highest end address of the best found allocation

112

self.best_size: int = 1 << 63

113

# For each live range: max value of size_at_time (only used in the heuristic allocation)

114

self.lr_urgency = len(self.lrs) * [0]

115

nr_time_slots = 1 + max(lr.end_time for lr in self.lrs)

116

# Contains active live ranges at each timestamp

117

self.lrs_at_time = [[] for i in range(nr_time_slots)]

118

for lr in self.lrs:

119

for t in range(lr.start_time, lr.end_time + 1):

120

self.lrs_at_time[t].append(lr)

121

# At each timestamp: accumulated size of active live ranges

122

size_at_time = [sum(lr.size for lr in self.lrs_at_time[t]) for t in range(nr_time_slots)]

123

# The minimum possible size, assuming all live ranges can be perfectly allocated

124

self.min_required_size: int = max(size_at_time)

125

# Calculate all neighbours + the urgency of each live range

for lr in self.lrs:

lr.urgency = 0

lr.neighbours = []

neighbours = set()

for t in range(lr.start_time, lr.end_time + 1):

131

lr.urgency = max(size_at_time[t], lr.urgency)

132

for lr2 in self.lrs_at_time[t]:

133

if lr2 not in neighbours and lr != lr2:

134

neighbours.add(lr2)

135

lr.neighbours.append(lr2)

136

137

def allocate_lr(self, lr: LiveRangeInfo):

138

"""

139

Allocates the given live range at the smallest possible address

140

"""

141

address = 0

142

predecessor = HillClimbAllocator.NO_PREDECESSOR

fits = False

while not fits:

fits = True

# Find neighbours that overlap with address

147

for lr2 in lr.neighbours:

148

if lr2.address == HillClimbAllocator.NOT_ALLOCATED or lr2.end_address <= address:

149

continue

150

if lr2.overlaps(address, lr.size):

151

# Overlap found increase address

152

fits = False

Henrik G Olsson

2021-03-08 18:20:00 +0100

[diff] [blame]

153

address = numeric_util.round_up(lr2.end_address, lr.min_alignment)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

154

predecessor = lr2.id

155

lr.address = address

156

lr.end_address = address + lr.size

157

lr.predecessor = predecessor

158

159

def allocate_indices(self, indices: List[int]):

160

"""

161

Allocates the live ranges in the order indicated by the indices;

162

allocates each live range at the lowest possible address.

163

"""

164

for lr in self.lrs:

165

lr.address = HillClimbAllocator.NOT_ALLOCATED

166

size = 0

167

for turn, index in enumerate(indices):

lr = self.lrs[index]

self.allocate_lr(lr)

lr.turn = turn

size = max(size, lr.end_address)

172

if size > self.best_size:

173

# This allocation is worse than the best known allocation;

174

# no need to continue

break

return size

def add_predecessor_turns(self, turn_set: Set[int], turn_list: List[int], lr: LiveRangeInfo):

179

"""

180

Adds the given live range + predecessors to the turns vector.

181

Note: the turn_set is for quick detection of duplicates,

182

the turn_list is to get reproduceable results

183

"""

184

if lr.turn not in turn_set:

185

turn_set.add(lr.turn)

186

turn_list.append(lr.turn)

187

id = lr.id

188

while self.lrs[id].predecessor != HillClimbAllocator.NO_PREDECESSOR:

189

id = self.lrs[id].predecessor

190

turn = self.lrs[id].turn

191

if turn not in turn_set:

192

turn_set.add(turn)

193

turn_list.append(turn)

194

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

195

def attempt_bottleneck_fix(self, indices: List[int], iterations_stuck):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

196

"""

197

Finds the "bottleneck", the live range with highest end address, and reorders the indices

198

such that a next allocation might lower the memory usage.

---------

| |

| D |

| |

----------------------------------

205

| B |

206

-------------------------------

| |

|A| ---

| | |C|

| | | |

---------------------------------------

212

213

In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck.

214

The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C),

215

and all direct and indirect predecessors of D and its neighbours

216

(i.e. A, which is the predecessor of B, and indirect predecessor of D).

217

218

By permuting the order in which the affecting live ranges are allocated, the bottleneck might

219

be lowered. In the above example, almost any permutation would lower the bottleneck.

220

"""

221

# Find the bottleneck

222

max_lr = self.lrs[0]

223

for lr in self.lrs[1:]:

224

if lr.end_address > max_lr.end_address:

225

max_lr = lr

226

227

# Find all live ranges that affected the placement of the bottleneck live range.

228

# This consists of two types of live ranges:

229

# - direct neighbours of the bottleneck live range

230

# - direct and indirect predecessors of these neighbours + bottleneck

231

# The turns at which these live ranges were allocated are put in the turns set.

Jonas Ohlsson

845e232

2022-03-01 12:39:55 +0100

[diff] [blame]

232

turn_set: Set[int] = set()

233

turn_list: List[int] = list()

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

234

self.add_predecessor_turns(turn_set, turn_list, max_lr)

235

for lr2 in max_lr.neighbours:

236

self.add_predecessor_turns(turn_set, turn_list, lr2)

237

238

# Non-direct neighbours that interfere with the allocation of the bottleneck are the

239

# immediate cause for gaps in the allocation, and are selected with higher probability.

240

non_nb_turn_list = []

241

for turn in turn_list:

242

lr = self.lrs[indices[turn]]

243

if not max_lr.is_neighbour(lr):

244

non_nb_turn_list.append(turn)

245

assert turn_list

246

# Pick from non-neighbour list with 30% probability

247

# (magic number based on tuning)

248

if random.randint(0, 100) < 30 and non_nb_turn_list:

249

# Pick a live range from the "non-neighbour list"

250

ix1 = non_nb_turn_list[random.randint(0, len(non_nb_turn_list) - 1)]

251

else:

252

# Pick any affecting live range.

253

ix1 = turn_list[random.randint(0, len(turn_list) - 1)]

254

255

ix2 = turn_list[random.randint(0, len(turn_list) - 2)]

if ix1 == ix2:

ix2 = turn_list[-1]

# Swap indices

indices[ix1], indices[ix2] = indices[ix2], indices[ix1]

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

260

if iterations_stuck > HillClimbAllocator.MAX_ITERATIONS_STUCK:

261

# The best allocation has not improved for a while, maybe improvement is not possible

262

# by single-swapping indices; add more neighbour live ranges and swap up to two more indices.

263

# Adding more neighbours can sometimes resolve the situation where the current bottleneck

264

# is resolved, but always results in a higher bottleneck at a nearby live range.

265

# Magic number is based on tuning

266

for turn in non_nb_turn_list:

267

for lr in self.lrs[indices[turn]].neighbours:

268

if lr.turn not in turn_set:

269

turn_set.add(lr.turn)

270

turn_list.append(lr.turn)

271

ix1 = turn_list[random.randint(0, len(turn_list) - 1)]

272

ix2 = turn_list[random.randint(0, len(turn_list) - 1)]

273

indices[ix1], indices[ix2] = indices[ix2], indices[ix1]

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

274

275

def search(self, indices: List[int], iterations: int):

276

"""

277

Search for a solution, using the given indices as initial solution.

278

"""

279

best_indices = indices[:]

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

280

last_improvement_iteration = 0

281

for i in range(iterations):

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

282

# Reorder the indices

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

283

self.attempt_bottleneck_fix(indices, i - last_improvement_iteration)

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

284

# Allocate the reordered indices and check if it gave an improvement

285

new_size = self.allocate_indices(indices)

286

if new_size <= self.best_size:

287

# The new allocation produced a new best result remember it

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

288

if new_size < self.best_size:

289

last_improvement_iteration = i

Louis Verhaard

2021-01-20 17:23:54 +0100

[diff] [blame]

290

self.best_size = new_size

291

best_indices = indices[:]

292

self.allocated_addresses = [lr.address for lr in self.lrs]

293

if self.best_size <= self.min_required_size:

Louis Verhaard

2022-02-28 15:12:57 +0100

[diff] [blame]

294

# Target reached; stop

Louis Verhaard