# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Contains the main sequencing of the compiler.
import time

from . import extract_npu_subgraphs
from . import graph_optimiser
from . import high_level_command_stream_generator
from . import high_level_command_to_npu_op
from . import insert_dma
from . import live_range
from . import lut
from . import mark_tensors
from . import npu_performance
from . import npu_serialisation
from . import pass_packing
from . import scheduler
from . import tensor_allocation
from . import weight_compressor
from .debug_database import DebugDatabase
from .errors import VelaError
from .nn_graph import PassPlacement
from .nn_graph import TensorAllocator
from .operation import Op
from .rewrite_graph import verify_graph_health
from .rewrite_graph import visit_graph_post_order
from .tensor import MemType
from .tensor import Tensor


class CompilerOptions:
    """Set of options to change compiler behaviour - verbosity, targets, turning off passes.

Note the difference between ArchitectureFeatures and CompilerOptions
- ArchitectureFeatures is for changing the Ethos-U and system architecture
- CompilerOptions is for changing the behaviour of the compiler
"""

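    # Example (illustrative values, not from the original source): the front end typically builds
    # this object from parsed command-line arguments, e.g.
    #     CompilerOptions(verbose_packing=True, tensor_allocator=TensorAllocator.Greedy, output_dir="outputs")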
    def __init__(
        self,
        verbose_graph=False,
        verbose_quantization=False,
        verbose_packing=False,
        verbose_tensor_purpose=False,
        verbose_tensor_format=False,
        verbose_allocation=False,
        verbose_high_level_command_stream=False,
        verbose_register_command_stream=False,
        verbose_operators=False,
        verbose_weights=False,
        show_cpu_operations=False,
        tensor_allocator=TensorAllocator.Greedy,
        timing=False,
        output_dir="outputs",
        cpu_tensor_alignment=Tensor.AllocationQuantum,
    ):

        self.verbose_graph = verbose_graph
        self.verbose_quantization = verbose_quantization
        self.verbose_packing = verbose_packing
        self.verbose_tensor_purpose = verbose_tensor_purpose
        self.verbose_tensor_format = verbose_tensor_format
        self.verbose_allocation = verbose_allocation
        self.verbose_high_level_command_stream = verbose_high_level_command_stream
        self.verbose_register_command_stream = verbose_register_command_stream
        self.verbose_operators = verbose_operators
        self.verbose_weights = verbose_weights
        self.show_cpu_operations = show_cpu_operations
        self.tensor_allocator = tensor_allocator
        self.timing = timing
        self.output_dir = output_dir
        self.cpu_tensor_alignment = cpu_tensor_alignment

    def __str__(self):
        return type(self).__name__ + ": " + str(self.__dict__)

    __repr__ = __str__


def next_sram_factor(alloc_results):
    # Bisects to find the max SRAM usage that can successfully be fitted with the tensor allocator.
    # Returns a tuple (factor, dry_test), where factor is None (stop) or 0 <= factor <= 1 (the next
    # SRAM factor to try), and dry_test is True while still bisecting.
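    # Example: with alloc_results == [False, True, True] the allocation at factor 1.0 failed but
    # 0.7 and 0.85 succeeded, so the remaining search interval is [0.85, 1.0] and the next call
    # returns (0.925, True).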
    upper = 1.0
    lower = 0.7
    MAX_ITERATIONS = 8
    if len(alloc_results) == 0:
        # First iteration, try max SRAM, keep the result if it succeeds
        return (upper, False)
    elif len(alloc_results) == 1:
        if alloc_results[0]:
            # The allocator succeeded on the first try; stop
            return (None, False)
        else:
            # Start bisecting, try lower-bound SRAM
            return (lower, True)
    elif len(alloc_results) > MAX_ITERATIONS:
        # Stop
        return (None, False)
    if not alloc_results[1]:
        # Allocation at lower failed; search interval 0 - lower
        upper = lower
        lower = 0
    best = lower
    for success in alloc_results[2:]:
        middle = (lower + upper) / 2
        if success:
            best = max(best, middle)
            lower = middle
        else:
            upper = middle
    if len(alloc_results) == MAX_ITERATIONS:
        # Done bisecting; repeat the best match, but not as dry test
        return (best, False)
    # Next try; run only as dry test
    return ((lower + upper) / 2, True)


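# Callback used with visit_graph_post_order below: records every non-constant operator in the
# DebugDatabase before the graph optimisation passes rewrite the graph.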
def _record_operator(op, arch):
    if op.type != Op.Const:
        DebugDatabase.add_source(op)


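# Main compilation pipeline: graph optimisation and tensor marking, packing into passes, extraction
# of Npu subgraphs, scheduling, weight/scale compression, tensor allocation, command stream
# generation, serialisation of the Npu subgraphs into tensors, and finally performance estimation.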
def compiler_driver(nng, arch, options, scheduler_options):
    assert verify_graph_health(nng)

    # Pre-optimisation operator tracking
    for sg in nng.subgraphs:
        visit_graph_post_order(sg.output_tensors, arch, [], [_record_operator])

    nng = graph_optimiser.optimise_graph_a(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)

    if options.verbose_quantization:
        nng.print_graph_with_tensor_quantization()

    nng = mark_tensors.mark_tensor_purpose(nng, arch, options.verbose_tensor_purpose)
    assert verify_graph_health(nng)
    nng = insert_dma.insert_dma_commands(nng, arch, options.verbose_graph)
    assert verify_graph_health(nng)
    pass_packing.pack_into_passes(nng, arch, options.verbose_packing)
    assert verify_graph_health(nng)

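    # Split the graph: groups of passes that can run on the Ethos-U are moved into separate Npu
    # subgraphs, each invoked from the Cpu (root) subgraph through a custom Npu-op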
    extract_npu_subgraphs.extract_npu_subgraphs(nng, arch)

    assert verify_graph_health(nng)
    if options.timing:
        start = time.time()

    # Run the scheduler
    scheduler.schedule_passes(nng, arch, scheduler_options)

    if options.timing:
        stop = time.time()
        print("Scheduling took %f s" % (stop - start))
        start = time.time()

    # Update the compressed weights now that we have determined the
    # block config, and calc and pack the scales and biases
    weight_compressor.update_pass_weight_and_scale_tensors(nng, arch)

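    # Optionally move the packed scale/bias tensors to fast storage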
    if scheduler_options.cache_bias_scale_tensor:
        scheduler.move_scales_to_fast_storage(nng, arch)

    # LiveRanges for constant tensors for all Npu subgraphs
    permanent_storage = arch.permanent_storage_mem_area
    lr_graph_flash = live_range.LiveRangeGraph()

    # Placeholders for scratch and flash tensors that are common for all Npu subgraphs
    scratch_tens = None
    scratch_fast_tens = None
    flash_tens = None

    # Calculate live ranges for all constant Npu tensors, in permanent storage
    for sg in nng.subgraphs:
        if sg.placement == PassPlacement.Npu:
            lr_graph_flash = live_range.extract_live_ranges_from_cascaded_passes(
                sg,
                permanent_storage,
                MemType.Permanent_NPU,
                ignore_subgraph_input_output_tensors=True,
                lr_graph=lr_graph_flash,
            )

    if len(nng.subgraphs) > 1:
        # Allocate all Npu constant tensors to the first Npu subgraph since it is
        # processed first during serialisation into tensors
        first_npu_sg = nng.subgraphs[1]
        assert first_npu_sg.placement == PassPlacement.Npu
        tensor_allocation.allocate_tensors(
            nng,
            first_npu_sg,
            arch,
            permanent_storage,
            set((MemType.Permanent_NPU,)),
            tensor_allocator=TensorAllocator.LinearAlloc,
            verbose_allocation=options.verbose_allocation,
            lr_graph=lr_graph_flash,
        )

    # Allocate all non-constant tensors to the root, i.e. Cpu, subgraph. This step
    # will start at the root subgraph's input and traverse from top to bottom. When
    # it comes across an Npu-op it will extract live ranges for its corresponding
    # Npu subgraph and add them to the root's live range graph.
    # The non-constant tensors are stored either in arch.feature_map_storage_mem_area or
    # arch.fast_storage_mem_area.
    # When these memory areas are the same, all non-constant tensors are allocated together.
    # Otherwise they are allocated separately.

    root_sg = nng.get_root_subgraph()

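    # Build the list of (memory area, memory-type set) pairs to allocate. With spilling enabled,
    # fast storage (Scratch_fast) is allocated first, followed by the feature map storage area;
    # otherwise Scratch and Scratch_fast are allocated together in the feature map storage area.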
    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

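    # Allocate each memory area in turn. In the spilling case the fast storage allocation is retried
    # with different SRAM limits, using next_sram_factor() to bisect towards the largest usage that fits.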
    for mem_area, mem_type_set in alloc_list:
        if arch.is_spilling_enabled() and mem_area == arch.fast_storage_mem_area:
            # For the case where scratch_fast != scratch: attempt to place feature maps used between
            # cascaded passes in fast storage. Bisection is used to find the max possible usage of SRAM.
            alloc_results = []
            while True:
                assert len(alloc_results) < 10, "Infinite allocator loop"
                sram_factor, dry_test = next_sram_factor(alloc_results)
                if sram_factor is None:
                    break
                # Try to move as many feature maps as possible to SRAM before allocating
                sram_limit = sram_factor * arch.sram_size
                for sg in nng.subgraphs:
                    scheduler.use_fast_storage_for_feature_maps(sg, sram_limit, arch)
                alloc_success = tensor_allocation.allocate_tensors(
                    nng,
                    root_sg,
                    arch,
                    mem_area,
                    mem_type_set,
                    max_size=arch.sram_size,
                    dry_test=dry_test,
                    tensor_allocator=options.tensor_allocator,
                    verbose_allocation=options.verbose_allocation,
                    cpu_tensor_alignment=options.cpu_tensor_alignment,
                )
                if dry_test or not alloc_success:
                    for sg in nng.subgraphs:
                        scheduler.undo_use_fast_storage(sg, arch)
                alloc_results.append(alloc_success)
            if not alloc_results[-1]:
                raise VelaError(
                    f"Sram limit {arch.sram_size} bytes has been exceeded by the scratch fast tensor. "
                    "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
                    "See OPTIONS.md for more information"
                )
        else:
            tensor_allocation.allocate_tensors(
                nng,
                root_sg,
                arch,
                mem_area,
                mem_type_set,
                tensor_allocator=options.tensor_allocator,
                verbose_allocation=options.verbose_allocation,
                cpu_tensor_alignment=options.cpu_tensor_alignment,
            )

    # Generate command streams and serialise Npu-ops into tensors
    for sg in nng.subgraphs:
        high_level_command_stream_generator.generate_high_level_command_stream(
            nng, sg, arch, options.verbose_high_level_command_stream
        )
        lut.optimize_high_level_cmd_stream(sg, arch)
        high_level_command_to_npu_op.generate_register_command_stream_for_sg(
            nng, sg, arch, options.verbose_register_command_stream
        )
        scratch_tens, scratch_fast_tens, flash_tens = npu_serialisation.serialise_npu_subgraph_into_tensors(
            nng, sg, arch, scratch_tens, scratch_fast_tens, flash_tens
        )

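    # Rewrite the custom Npu-op call operators in the root (Cpu) subgraph so that they reference
    # the tensors produced by the serialisation above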
    npu_serialisation.rewrite_npu_call_ops(nng, root_sg, arch)

    # Set Scratch and Fast_scratch Tensor size
    if scratch_tens is not None:
        scratch_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch, 0)])
    if scratch_fast_tens is not None:
        scratch_fast_tens.set_all_shapes([root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)])

    # Allocate all Cpu constant tensors; this is done last because the Npu-ops
    # have to be serialised into flash and scratch tensors first
    tensor_allocation.allocate_tensors(
        nng,
        root_sg,
        arch,
        permanent_storage,
        set((MemType.Permanent_CPU,)),
        tensor_allocator=TensorAllocator.LinearAlloc,
        verbose_allocation=options.verbose_allocation,
        cpu_tensor_alignment=options.cpu_tensor_alignment,
    )

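    # Estimate performance (cycle counts and memory bandwidth) for the compiled network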
    npu_performance.calc_performance_for_network(nng, arch)