Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

20

import copy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

21

from enum import auto

22

from enum import IntEnum

23

from typing import Dict

24

from typing import List

25

from typing import Optional

26

from typing import Tuple

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame^]

28

import numpy as np

29

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

30

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

31

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

32

from . import tensor_allocation

33

from . import weight_compressor

34

from .architecture_allocator import ArchitectureBlockConfig

35

from .architecture_allocator import find_block_config

36

from .architecture_allocator import get_ifm_area_required

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

37

from .architecture_features import ArchitectureFeatures

38

from .architecture_features import Block

39

from .cascade_builder import CascadeBuilder

40

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

41

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

42

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

43

from .nn_graph import Graph

44

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

45

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

46

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

47

from .nn_graph import Subgraph

48

from .numeric_util import round_down

49

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

50

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

51

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

52

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

53

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

54

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

55

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

56

from .tensor import TensorFormat

57

from .tensor import TensorPurpose

58

from .tensor import TensorSubPurpose

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

59

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

60

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

61

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the shape to use for a tensor stored in the given format.

    For TensorFormat.NHCWB16 the depth is replaced by round_up(depth, 16);
    for every other format the shape is returned unchanged.
    """
    if tensor_format != TensorFormat.NHCWB16:
        return shape

    padded_depth = round_up(shape.depth, 16)
    return shape.with_depth(padded_depth)

class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    # Values are assigned automatically, in declaration order
    Size = auto()
    Performance = auto()

    def __str__(self):
        # Print the member name only (e.g. "Size") rather than the default enum form
        return str(self.name)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

78

class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        # Values supplied by the caller
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe

        # Defaults that are filled in later
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # A single slice covering the full depth of the stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.npu_scales_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo built from this one's constructor arguments.

        Only the constructor arguments and the cascade number are carried over;
        every other attribute of the copy keeps its constructor default.
        """
        duplicate = SchedulerOpInfo(
            self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,
        )
        duplicate.cascade = self.cascade
        return duplicate

    def __str__(self):
        # The weight entries print "None" when the corresponding tensor is not set
        pieces = (
            f"\t\tBlock Config = {self.block_config}\n",
            f"\t\tOFM Block = {self.block_config.ofm_block}\n",
            f"\t\tIFM Stripe = {self.stripe_input}\n",
            f"\t\tIFM2 Stripe = {self.stripe_input2}\n",
            f"\t\tOFM Stripe = {self.stripe}\n",
            f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n",
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n",
            f"\t\tDepth slices = {self.ofm_depth_slices}\n",
            f"\t\tAssigned Cascade = {self.cascade}",
        )
        return "".join(pieces)

class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        # The SRAM target is stored under the name optimization_sram_limit
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        # Class name followed by a dump of every instance attribute
        class_name = type(self).__name__
        attributes = str(self.__dict__)
        return f"{class_name}: {attributes}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

141

class SchedulerTensor:
    """Scheduler-side record of a tensor: its shape, data type, memory area and format.

    `connection` starts out as None and is assigned later by the
    add_*_connection methods of SchedulerOperation.
    """

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # Not connected to anything until a Connection is attached
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

148

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

149

150

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler node for the primary operation of Pass `ps`.

        `nng` is accepted but not used by this constructor.
        """
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm.resampling_mode
        # True when the op has a second input and either input has an empty shape
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input

        Raises AssertionError if this operation has no IFM2.
        """
        if not self.ifm2:
            # Raised explicitly (instead of `assert False`) so the check still
            # fires when Python runs with -O, which strips assert statements
            raise AssertionError(f"Trying to set an IFM2 Connection to {self} which has no IFM2")
        conn.consumers.append(self)
        self.ifm2.connection = conn

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 when there is no IFM2"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        `nng` is accepted but not used here. As a side effect the parent Pass'
        block_config is overwritten with the chosen block configuration.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                # IFM2 uses the same stripe requirement, clamped to its own size
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the stripe input requirement for an OFM stripe with h,w = 1,1

        The result is unpacked by __init__ as (width, height).
        """
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        """Returns the result of find_block_config for the given IFM/IFM2/OFM shapes"""
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        # The Tensor this edge stands for
        self.parent_tens = tensor

        # SchedulerOperation relationships - both start empty and are filled
        # in as operations attach themselves to this connection
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        tensor_name = self.parent_tens.name
        return f"<Connection {tensor_name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

318

class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Per-operation scheduling info, keyed by SchedulerOperation
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        # Cascade info keyed by an integer
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        """Subgraph name and schedule label joined by an underscore"""
        sg_name, label = self.sg.name, self.label
        return f"{sg_name}_{label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

332

333

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

334

class Scheduler:

335

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

336

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

337

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

338

self.nng = nng

339

self.sg = sg

340

self.arch = arch

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

341

self.sched_ops: List(SchedulerOperation) = []

342

self.max_schedule = None

343

self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

344

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

345

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Creates a Scheduler Graph representation

    Appends one SchedulerOperation per Pass that has a primary op to
    self.sched_ops, links them through shared Connection objects and stores
    the largest per-operation full-FeatureMap memory requirement in
    self.min_memory_req.
    """
    # Temporary dict for creating connections between the Operations
    connections: Dict[Tensor, Connection] = {}

    def connection_for(tens):
        # Create the Connection for a tensor the first time it is seen
        if tens not in connections:
            connections[tens] = Connection(tens)
        return connections[tens]

    # Memory required for the largest FeatureMap that has to be full
    largest_full_fm_req = 0
    sg_inputs = self.sg.input_tensors
    sg_outputs = self.sg.output_tensors

    for ps in self.sg.passes:
        if not ps.primary_op:
            continue

        # Set tensor format to NHCWB16 for output FeatureMaps, if possible
        for output in ps.outputs:
            is_internal_fm = output not in sg_outputs and output.purpose == TensorPurpose.FeatureMap
            if is_internal_fm and not output.needs_linear_format:
                output.set_format(TensorFormat.NHCWB16, arch)

        # Create SchedulerOperations
        op = SchedulerOperation(ps, arch, self.nng)
        op.index = len(self.sched_ops)

        # Make connections
        ifm_tens = ps.ifm_tensor
        ifm2_tens = ps.ifm2_tensor
        ofm_tens = ps.ofm_tensor

        op.add_ifm_connection(connection_for(ifm_tens))
        if ifm2_tens:
            op.add_ifm2_connection(connection_for(ifm2_tens))
        op.add_ofm_connection(connection_for(ofm_tens))

        # Set requirements on the ifm/ofm buffers
        self.sched_ops.append(op)
        if ifm_tens in sg_inputs:
            # This Op consumes a subgraph input
            op.requires_full_ifm = True
        if ifm2_tens and ifm2_tens in sg_inputs:
            # This Op consumes a subgraph input
            op.requires_full_ifm2 = True
        if ofm_tens in sg_outputs:
            # This Op produces a subgraph output
            op.requires_full_ofm = True
        if ifm_tens.needs_linear_format:
            op.requires_full_ifm = True
        if ifm2_tens and ifm2_tens.needs_linear_format:
            op.requires_full_ifm2 = True
        if ofm_tens.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
            op.requires_full_ofm = True
        if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
            # Op has multiple outputs or consumers - requires full OFM
            op.requires_full_ofm = True

        # Check memory requirements if this Op requires any full FeatureMaps
        op_memory_req = (
            (op.ifm_size_in_bytes() if op.requires_full_ifm else 0)
            + (op.ifm2_size_in_bytes() if op.requires_full_ifm2 else 0)
            + (op.ofm_size_in_bytes() if op.requires_full_ofm else 0)
        )
        largest_full_fm_req = max(op_memory_req, largest_full_fm_req)

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = largest_full_fm_req

411

412

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind"""
    max_schedule = Schedule(self.sg, "MAX")

    for sched_op in self.sched_ops:
        # Every op produces its complete OFM as a single stripe
        full_ofm = sched_op.ofm.shape
        op_info = sched_op.create_scheduler_info(self.nng, full_ofm)
        op_info.cycles = self.estimate_op_performance(sched_op, op_info.block_config, full_ofm.depth)
        max_schedule.cost_map[sched_op] = op_info

    return max_schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Record the temporal fast-storage memory usage and its peak on `schedule`.

    Live ranges are extracted from the cascaded passes of the root subgraph
    for the fast storage memory area, restricted to the Scratch and
    Scratch_fast memory types.
    """
    fast_storage = self.arch.fast_storage_mem_area
    scratch_mem_types = {MemType.Scratch, MemType.Scratch_fast}

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(), fast_storage, scratch_mem_types, lr_graph, Tensor.AllocationQuantum,
    )

    # Populate time-array with memory used by live ranges
    usage_over_time = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = usage_over_time

    # Set the peak memory usage (0 when the time-array is empty)
    schedule.fast_storage_peak_usage = max(usage_over_time, default=0)

439

440

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Return npu_performance.measure_cycle_cost for `op` using `block_config`.

    The query describes the op's full IFM/IFM2 and its OFM with the depth
    replaced by `ofm_depth`.
    """
    ifm, ifm2, ofm = op.ifm, op.ifm2, op.ofm

    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)

    # First input feature map
    query.ifm_shape = ifm.shape
    query.ifm_memory_area = ifm.mem_area
    query.ifm_bits = ifm.dtype.size_in_bits()
    query.ifm_format = ifm.format

    # Second input feature map - each field is left as the falsy ifm2 value when there is none
    query.ifm2_shape = ifm2 and ifm2.shape
    query.ifm2_memory_area = ifm2 and ifm2.mem_area
    query.ifm2_bits = ifm2 and ifm2.dtype.size_in_bits()
    query.ifm2_format = ifm2 and ifm2.format

    # Output feature map, limited to the requested depth
    query.ofm_shape = ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = ofm.mem_area
    query.ofm_bits = ofm.dtype.size_in_bits()
    query.ofm_format = ofm.format

    if op.parent_op.bias:
        # The constant shape always uses the full OFM depth, not ofm_depth
        query.const_shape = Shape4D(1, 1, 1, ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    activation_type = op.activation and op.activation.op_type
    return npu_performance.measure_cycle_cost(self.arch, op.op_type, activation_type, query)

462

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

463

def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
    """Create a buffered schedule"""
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

    # Walk the ops in order, remembering the last one that was processed
    last_op = None
    for sched_op in self.sched_ops:
        # Ops missing from the reference cost map are not part of this sub-schedule
        if sched_op in ref_schedule.cost_map:
            self.propose_operator_buffering(sched_op, last_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            last_op = sched_op

    return buffered_schedule

477

478

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for `sched_op` to `buffered_schedule`.

    The entry is a shallow copy of the reference schedule's cost. Weight
    buffering is attempted when the parent op has weights. Returns the new
    cost entry, or None when the op already has an entry.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return None

    # Take the reference schedule as default costings for this schedule
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles

    # Memory in use at this op's time index; 0 when the index is past the end of the snapshot
    memory_snapshot = ref_schedule.memory_snapshot
    if ref_cost.time_index < len(memory_snapshot):
        ref_memory_usage = memory_snapshot[ref_cost.time_index]
    else:
        ref_memory_usage = 0
    cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    parent_op = sched_op.parent_op
    if parent_op.weights:
        self.propose_weight_buffering(
            parent_op.weights,
            parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            cost.slack_buffering_memory,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True when the weights should be moved, False otherwise.

    That is the case only when a weight tensor is given, its memory type is
    neither Scratch nor Scratch_fast, it lives in Dram or OffChipFlash, and
    permanent storage is a different memory area from fast storage.
    """
    if not weight_tensor:
        return False

    if weight_tensor.mem_type in (MemType.Scratch, MemType.Scratch_fast):
        return False

    # Weights are in permanent storage
    # Only when permanent storage differs from feature map storage, there is a point moving the data
    if weight_tensor.mem_area not in (MemArea.Dram, MemArea.OffChipFlash):
        return False

    if self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area:
        return True

    return False

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Choose the weight encoding, depth slicing and buffering for `sched_op`.

    Updates the op's cost entry in `buffered_schedule` in place: the encoded
    weight and scale tensors, the OFM depth slices, an optional buffered
    weight tensor and the slack cycles/memory figures. For cascaded ops the
    memory snapshot of `ref_schedule` is updated as well. Returns None.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    encoded_weights = full_weights
    encoded_scales = full_scales

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with limited
        # double buffer buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

        prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            block_depth = cost.block_config.ofm_block.depth

            # Choose initial prebuffering depth (already buffer clamped)
            prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            # Calculate cycles executed during the prebuffer
            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
            buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

            # Choose initial buffering depth and clamp to the double buffering limit
            buffering_depth = round_up(buffering_depth, block_depth)
            buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
            if buffering_bytes > half_buffer_limit:
                buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

            while True:
                # Attempt to buffer whole blocks.
                # Compare depth with depth: the previous test used buffering_bytes, a
                # byte count that is never updated inside this loop, so depths below
                # one block were rounded down to 0 and then forced to OFMSplitDepth.
                if buffering_depth > block_depth:
                    buffering_depth = round_down(buffering_depth, block_depth)
                else:
                    buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )

                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.max_range_bytes <= half_buffer_limit
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                # Halve whichever of the two depths is currently the larger
                if buffering_depth > prebuffer_depth:
                    buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                else:
                    prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = self.buffer_tensor(
            encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name
        )
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

687

Jacob Bohlin

eee9e5d

2021-08-17 17:44:45 +0200

[diff] [blame]

688

def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
    """Creates the uint8 tensor that weights are buffered into.

    The tensor is named name + "_buffer", has the shape [1, 1, 1, buffer_size], refers back to src_tensor and is
    tagged as Weights with the given sub_purpose"""
    staging_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name + "_buffer")
    staging_tensor.src_tensor = src_tensor

    # The buffer is placed in the architecture's fast storage area as fast scratch memory
    staging_tensor.mem_area = self.arch.fast_storage_mem_area
    staging_tensor.mem_type = MemType.Scratch_fast

    staging_tensor.purpose = TensorPurpose.Weights
    staging_tensor.sub_purpose = sub_purpose
    return staging_tensor

696

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

697

def propose_minimal_schedule(self) -> Schedule:
    """Proposes the "MIN" schedule, where every Op produces the smallest stripe that satisfies the vertical stride
    of the Op visited just before it (the Ops are walked in reverse order). The first Op visited gets a stripe
    height of 1"""
    min_schedule = Schedule(self.sg, "MIN")

    # The previously visited Op is the one that consumes the current Op's OFM
    consumer = None
    for sched_op in reversed(self.sched_ops):
        if consumer:
            stripe_height = consumer.kernel.stride.y
        else:
            stripe_height = 1

        op_cost = sched_op.create_scheduler_info(self.nng, sched_op.ofm.shape.with_height(stripe_height))
        # Performance is estimated over the full OFM depth
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        min_schedule.cost_map[sched_op] = op_cost

        consumer = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Proposes a schedule named label with new striping. The last Op produces final_stripe and each earlier Op gets
    a stripe derived from the IFM requirement of the Op after it. Ops without a cost in ref_schedule are left out"""
    reference_costs = ref_schedule.cost_map
    proposal = Schedule(self.sg, label)

    current_stripe = final_stripe
    for sched_op in reversed(self.sched_ops):
        # Ops without a reference cost are not part of the sub-schedule
        if sched_op not in reference_costs:
            continue

        op_cost = sched_op.create_scheduler_info(self.nng, current_stripe)

        # Weights that are buffered in the reference schedule stay buffered in the proposal
        if reference_costs[sched_op].buffered_weight_tensor:
            weights = op_cost.npu_weights_tensor
            op_cost.buffered_weight_tensor = self.buffer_tensor(
                weights, TensorSubPurpose.Standard, len(weights.buffer), weights.name
            )

        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        proposal.cost_map[sched_op] = op_cost

        # The preceding Op's stripe is this Op's IFM shape with the stripe height scaled by the vertical stride
        current_stripe = sched_op.ifm.shape.with_height(current_stripe.height * sched_op.kernel.stride.y)

    return proposal

746

747

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Returns the largest per-Op memory usage estimate over the Ops that have a cost entry in the schedule.

    A cascaded Op contributes the mem_usage of its cascade. Any other Op contributes the sum of its IFM size, OFM
    size, buffered weight size (if any) and its entry in non_local_mem_usage (0 when absent)"""
    op_costs = schedule.cost_map
    cascades = schedule.cascades

    peak_mem_usage = 0
    for sched_op in self.sched_ops:
        # Ops without a cost are not part of the sub-schedule
        if sched_op not in op_costs:
            continue

        op_cost = op_costs[sched_op]
        if op_cost.cascade:
            # Non-local memory usage is already included in the cascade's memory usage
            candidate = cascades[op_cost.cascade].mem_usage
        else:
            if op_cost.buffered_weight_tensor:
                weight_buffer_bytes = op_cost.buffered_weight_tensor.storage_size()
            else:
                weight_buffer_bytes = 0

            candidate = (
                sched_op.ifm_size_in_bytes()
                + sched_op.ofm_size_in_bytes()
                + weight_buffer_bytes
                + non_local_mem_usage.get(sched_op, 0)
            )

        peak_mem_usage = max(candidate, peak_mem_usage)

    return peak_mem_usage

777

778

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Creates a sub-schedule from the Ops covered by the given cascade and optimizes it. Weight buffering is
    proposed first and then stripe heights for the final Op are proposed in a binary-search-like fashion.

    Returns the last proposal whose estimated memory usage was within memory_limit, or None if no proposal fit"""
    ref_cost = ref_schedule.cost_map

    # The Ops covered by the cascade, both ends included
    first, last = cascade_info.start, cascade_info.end
    sub_ops = self.sched_ops[first : last + 1]

    # The sub-schedule shares the reference costs, the cascade and the memory snapshot of the reference schedule
    sub_schedule = Schedule(self.sg, f"SUB_{first}_{last}")
    for sched_op in sub_ops:
        sub_schedule.cost_map[sched_op] = ref_cost[sched_op]
    sub_schedule.cascades[last] = cascade_info
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Memory that is live while the sub-schedule executes but does not belong to the cascade
    cascade_time = ref_cost[sub_ops[0]].time_index
    parallel_mem_usage = ref_schedule.memory_snapshot[cascade_time] - cascade_info.mem_usage

    # An initial IFM with more than one consumer has to stay alive for the whole sub-schedule, whether it is
    # included in a cascade or not
    head_op = sub_ops[0]
    if len(head_op.ifm.connection.consumers) > 1:
        persistent_initial_ifm = head_op.ifm_size_in_bytes()
    else:
        persistent_initial_ifm = 0

    # Non-local memory usage per Op - every Op but the first also carries the persistent initial IFM
    non_local_mem_usage = {
        sched_op: parallel_mem_usage + persistent_initial_ifm if idx != 0 else parallel_mem_usage
        for idx, sched_op in enumerate(sub_ops)
    }

    cascade_builder = CascadeBuilder(sub_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Buffering is proposed first; the cascades are carried over from the unbuffered sub-schedule
    buffered_sub_schedule = self.propose_schedule_buffering(
        sub_schedule, self.scheduler_options.optimization_sram_limit
    )
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Candidate stripes for the final Op: every height from 1 up to half of the OFM height
    final_ofm_shape = sub_ops[-1].ofm.shape
    max_stripe_height = final_ofm_shape.height // 2
    possible_stripes = [final_ofm_shape.with_height(height) for height in range(1, max_stripe_height + 1)]

    best_schedule = None
    iteration = 0
    while len(possible_stripes) > 1:
        middle = len(possible_stripes) // 2
        proposed_schedule = self.propose_schedule_striping(
            possible_stripes[middle], f"OPTIMIZED_{iteration}", buffered_sub_schedule
        )

        cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

        proposed_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
        if proposed_mem_usage > memory_limit:
            # The proposal does not fit - drop it together with every larger stripe
            possible_stripes = possible_stripes[:middle]
        else:
            # The proposal fits - drop every smaller stripe
            possible_stripes = possible_stripes[middle:]
            best_schedule = proposed_schedule
            if not proposed_schedule.cascades:
                # No cascading required - early exit
                break

        iteration += 1

    return best_schedule

def optimize_schedule(
    self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
) -> Schedule:
    """Optimizes the sub-schedule of every cascade in the schedule, merges the results back into it and finally
    proposes weight buffering for the complete schedule.

    max_sched is returned unchanged when its fast storage peak usage is below the SRAM limit and spilling is not
    enabled"""
    sram_limit = options.optimization_sram_limit

    max_sched_fits = max_sched.fast_storage_peak_usage < sram_limit
    if max_sched_fits and not self.arch.is_spilling_enabled():
        # Maximum performance schedule fits within the SRAM target
        return max_sched

    # The cascades are snapshotted up front because the dictionary is modified inside the loop
    for cascade_info in tuple(schedule.cascades.values()):
        opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        if not opt_sub_schedule:
            # Nothing better was found - the existing cascade is left as it is
            continue

        # Replace the existing cascade with the Op costs and cascades of the optimized sub-schedule
        del schedule.cascades[cascade_info.end]
        schedule.cost_map.update(opt_sub_schedule.cost_map)
        schedule.cascades.update(opt_sub_schedule.cascades)

    # Make the merged schedule current and refresh the memory snapshot
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Propose buffering for the merged schedule and carry the cascade metadata over from the unbuffered schedule
    optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
    optimized_sched.cascades = schedule.cascades
    return optimized_sched

883

884

def apply_schedule(self, sched: Schedule):
    """Applies the given schedule as the final solution.

    For each Op this sets up the rolling IFM buffer when the Op has a buffer in its cascade, stores the block
    config on the parent pass and re-links a buffered weight tensor to the Op's NPU weights tensor"""
    for sched_op in self.sched_ops:
        op_info = sched.cost_map[sched_op]
        cascade_info = sched.cascades.get(op_info.cascade, None)

        if cascade_info and sched_op in cascade_info.buffers:
            rolling_tens = sched_op.ifm.connection.parent_tens
            # The buffer lives in fast storage as fast scratch memory
            rolling_tens.mem_area = self.arch.fast_storage_mem_area
            rolling_tens.mem_type = MemType.Scratch_fast
            # It becomes an NHCWB16 rolling buffer in Y with the height given by the cascade
            rolling_tens.set_format(TensorFormat.NHCWB16, self.arch)
            buffer_height = cascade_info.buffers[sched_op].height
            rolling_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, buffer_height)

        sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

        # Ensure that the src_tensor reference is set correctly
        weight_buffer = op_info.buffered_weight_tensor
        if weight_buffer:
            weight_buffer.src_tensor = op_info.npu_weights_tensor

903

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame^]

904

def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
    """Decides which feature maps stay in fast storage given the staging_limit.

    The OFM of every non-cascaded Op that has dependants, and whose consumer list contains no None entry, is first
    moved to fast storage. If the peak of the resulting temporal memory usage is within staging_limit nothing more
    is done. Otherwise each affected live range is either kept or evicted; evicting restores the memory area and
    type that its tensors had before. Live ranges that compete for space are grouped into components that are
    resolved one at a time by a FastStorageComponentAllocator"""
    # Maps each moved tensor to the (mem_area, mem_type) it had before, so that an eviction can restore them
    scratched_fms = {}
    fast_storage_type = MemType.Scratch_fast
    fast_storage_mem_area = self.arch.fast_storage_mem_area

    # Force all OFMs to fast-storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]
        if cost.cascade == 0 and sched_op.get_dependants():
            ofm_tens = sched_op.ofm.connection.parent_tens
            if not any(cons is None for cons in ofm_tens.consumer_list):
                if ofm_tens not in scratched_fms:
                    scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
                ofm_tens.mem_area = fast_storage_mem_area
                ofm_tens.mem_type = fast_storage_type

    # Collect live ranges from tensors
    mem_type_set = {MemType.Scratch, MemType.Scratch_fast}
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(), fast_storage_mem_area, mem_type_set, lr_graph, Tensor.AllocationQuantum,
    )
    max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

    # If true, everything fits and we can proceed
    if max(max_mem_usage) <= staging_limit:
        return

    # Build up the base memory usage by removing the
    # mem_usage of the lrs we previously moved to fast-storage
    base_mem_usage = np.array(max_mem_usage)
    curr_lrs = []
    for lr in lr_graph.lrs:
        # Every stored value is a non-empty tuple, so a membership test is all that is needed
        if any(tens in scratched_fms for tens in lr.tensors):
            curr_lrs.append(lr)
            base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size

    competing_lrs = []
    for lr in curr_lrs:
        base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
        # If true, the lr will never fit and may thus be evicted
        if base_usage + lr.size > staging_limit:
            FastStorageComponentAllocator.evict(lr, max_mem_usage, scratched_fms)
            continue
        # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
        # the memory limit cannot be exceeded if max_mem_usage does not.
        # Thus, the affected lrs can remain in fast-storage if the following is true
        if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
            FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
        else:
            competing_lrs.append(lr)

    # All lrs and their tensors have been handled if nothing competes, we may thus return
    if not competing_lrs:
        return

    competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
    component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)

    # Build up components and then allocate each separately. A component grows for as long as the next lr starts
    # no later than the component's current end time and the exhaustive search size limit has not been reached
    start = 0
    end_time = competing_lrs[0].end_time
    for i, lr in enumerate(competing_lrs):
        if lr.start_time <= end_time and i - start < component_allocator.max_exhaustive_size:
            end_time = max(end_time, lr.end_time)
        else:
            component_allocator.allocate_component(
                component_allocator,
                competing_lrs[start:i],
                max_mem_usage,
                base_mem_usage,
                staging_limit,
                scratched_fms,
            )
            start = i
            end_time = lr.end_time

    # The final component is still pending once the loop has finished
    component_allocator.allocate_component(
        component_allocator, competing_lrs[start:], max_mem_usage, base_mem_usage, staging_limit, scratched_fms
    )

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

990

991

def move_constant_data(self):
    """Determines which input tensors are moved out of permanent storage. A moved tensor is replaced on the parent
    operation (and on the parent pass when the index is valid) by a clone in fast storage, or in SHRAM for a LUT.
    A move will generate a DMA command in the high-level command stream"""
    scratch_types = (MemType.Scratch, MemType.Scratch_fast)

    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        usable_banks = self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = usable_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            # Scratch tensors are not in permanent storage
            if tens.mem_type in scratch_types:
                continue

            # Only when permanent storage differs from feature map storage is there a point in moving the data.
            # LUTs are always considered
            if not (
                (
                    tens.mem_area in self.arch.permanent_storage_mem_area
                    and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                )
                or tens.purpose == TensorPurpose.LUT
            ):
                continue

            # Besides LUTs only elementwise broadcast feature maps that are too large for the SHRAM qualify
            if not (
                tens.purpose == TensorPurpose.LUT
                or (
                    tens.purpose == TensorPurpose.FeatureMap
                    and sched_op.op_type.is_binary_elementwise_op()
                    and tens.shape != []
                    and sched_op.ifm.shape != sched_op.ofm.shape
                    and parent_op.write_shape is None
                    and tens.storage_size() > max_ifm_shram_avail
                )
            ):
                continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            # A tensor that is only consumed by vector products is left in place, unless it is a LUT
            if only_vector_product_consumers and tens.purpose != TensorPurpose.LUT:
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if tens.purpose == TensorPurpose.LUT:
                new_tens.mem_area = MemArea.Shram

            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            # If the index is out of range, IFM and IFM2 are the same tensor
            # and pass inputs don't have duplicates
            if idx < len(sched_op.parent_ps.inputs):
                sched_op.parent_ps.inputs[idx] = new_tens

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1036

1037

def print_schedule(self, schedule: Schedule):
    """Prints the schedule: one entry per Op that has a cost in the schedule, followed by the list of cascades"""
    print(f"Schedule: '{schedule.name}'")

    op_costs = schedule.cost_map
    for sched_op in self.sched_ops:
        # Ops without a cost are skipped, which is what happens when a sub-schedule is printed
        if sched_op not in op_costs:
            continue

        op_info = op_costs[sched_op]
        print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
        print(f"\t\tType: {sched_op.op_type}")
        print(f"\t\tKernel: {sched_op.kernel}")
        print(f"{op_info}")

        # A time index beyond the memory snapshot is reported as 0 bytes
        if op_info.time_index < len(schedule.memory_snapshot):
            mem_usage = schedule.memory_snapshot[op_info.time_index]
        else:
            mem_usage = 0
        print(f"\t\tSRAM Used: {mem_usage} bytes")

    print(f"\tCascades:")
    for position, cascade in enumerate(schedule.cascades.values()):
        print(f"\t\t{position}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

1059

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1060

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1061

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph, once per (memory area, memory type set) pair.

    With spilling enabled the fast scratch tensors are allocated in the fast storage area first and the scratch
    tensors in the feature map storage area second. Without spilling both memory types are allocated together in
    the feature map storage area. The function itself has no return statement.
    """
    root_sg = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important
        alloc_list = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        alloc_list = [(arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    # Allocator settings come from the options and are the same for every memory area
    allocator_settings = dict(
        tensor_allocator=options.tensor_allocator,
        verbose_allocation=options.verbose_allocation,
        cpu_tensor_alignment=options.cpu_tensor_alignment,
    )
    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(nng, root_sg, arch, mem_area, mem_type_set, **allocator_settings)

1091

1092

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame^]

1093

class FastStorageComponentAllocator:
    """Chooses which live ranges (lrs) of a component to keep in fast storage by exhaustive search.

    Each lr is expected to provide start_time, end_time (inclusive), size and tensors. base_mem_usage and
    max_mem_usage are per-time-step memory usages indexed by time. base_mem_usage is used as given (it is modified
    in place during the search and restored afterwards), whereas max_mem_usage is copied on construction"""

    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        self.base_mem_usage = base_mem_usage
        # Private copy - the search modifies it temporarily
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        # Eviction decisions of the best solution found so far, one entry per lr
        self.evicted = []
        # Eviction decisions of the branch that is currently being explored
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_allocated_size = 0
        # Largest number of lrs that a caller should put into one component
        self.max_exhaustive_size = 20

    def allocate_exhaustive(self, ix, alloc_size):
        """Recursively explores keeping or evicting self.lrs[ix:], where alloc_size is the total size of the lrs
        kept so far. The decisions with the largest kept size are left in self.evicted"""
        if ix >= len(self.lrs):
            if alloc_size > self.best_allocated_size:
                self.best_allocated_size = alloc_size
                # Store a snapshot - curr_evicted keeps changing while the remaining branches are explored, so
                # keeping a reference to it would overwrite the best solution with the last explored state
                self.evicted = list(self.curr_evicted)
            return

        lr = self.lrs[ix]
        # Sanity check over the whole lifetime of the lr, end_time included
        for t in range(lr.start_time, lr.end_time + 1):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        # can_fit: the lr fits on top of what has been kept so far
        can_fit = base_usage + lr.size <= self.staging_limit
        # always_fits: the lr also fits with every lr that has not been evicted, so evicting it cannot help
        always_fits = can_fit

        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

        # always_fits implies can_fit, so checking can_fit is sufficient
        if can_fit:
            # Explore the branch where the lr is kept
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            self.allocate_exhaustive(ix + 1, alloc_size + lr.size)
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            # Explore the branch where the lr is evicted
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, alloc_size)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        """Adds (increase is true) or subtracts lr.size in place for every time step of the lr and returns
        mem_usage"""
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += lr.size if increase else -lr.size
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        """Removes lr.size from max_mem_usage over the lifetime of the lr and gives every tensor of the lr that is
        found in scratched_fms its stored (mem_area, mem_type) back"""
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area = scratched_fms[tens][0]
                tens.mem_type = scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        """Adds lr.size to base_mem_usage over the lifetime of the lr, asserting that the limit is respected"""
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms):
        """Runs the exhaustive search of allocator on lrs and applies the result: evicted lrs are removed from
        max_mem and get their tensors restored, the others are added to min_mem"""
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * sz
        allocator.curr_evicted = [0] * sz
        # Starting below zero makes sure that the first complete solution is recorded, even if it keeps nothing
        allocator.best_allocated_size = -1
        # Recursively evaluate all permutations of allocations of the lrs found in the component
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for i, e in enumerate(allocator.evicted):
            if e:
                self.evict(lrs[i], max_mem, scratched_fms)
            else:
                self.keep(lrs[i], min_mem, staging_limit)

1172

1173

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1174

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1175

"""Entry point for the Scheduler"""

1176

# Initialize CPU subgraphs

1177

schedulers = dict()

1178

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1179

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1180

if sg.placement != PassPlacement.Npu:

1181

# Create cascaded passes for CPU Ops

1182

cascaded_passes = []

1183

for idx, ps in enumerate(sg.passes):

1184

cps = CascadedPass(

1185

ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,

1186

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1187

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1188

cps.time = idx

1189

ps.cascade = cps

1190

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1191

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1192

sg.cascaded_passes = cascaded_passes

1193

else:

1194

# Npu subgraph - create schedule

1195

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1196

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1197

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1198

scheduler.create_scheduler_representation(arch)

1199

sg.sched_ops = scheduler.sched_ops

1200

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1201

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1202

# Create the Max schedule template

1203

max_schedule_template = scheduler.create_initial_schedule()

1204

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1205

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1206

# Create the optimimised Max schedule

1207

sg.schedule = max_schedule_template

1208

scheduler.update_op_memory_snapshot(max_schedule_template)

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

1209

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1210

sg.schedule = opt_max_schedule

1211

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1212

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1213

# Create Min schedule

1214

min_schedule = scheduler.propose_minimal_schedule()

1215

initial_sram_limit = scheduler_options.optimization_sram_limit

1216

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1217

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1218

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1219

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1220

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1221

sg.schedule = min_schedule

1222

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1223

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1224

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1225

# Create an optimized schedule

1226

sg.schedule = scheduler.optimize_schedule(

1227

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1228

)

1229

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1230

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1231

scheduler.apply_schedule(sg.schedule)

1232

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1233

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1234

if scheduler_options.verbose_schedule:

1235

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1236

Tim Hall