Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

20

# For Class name forward references for the type annotations. (see PEP 563).

21

from __future__ import annotations

22

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

23

import copy

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

24

from collections import namedtuple

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

25

from enum import auto

26

from enum import IntEnum

27

from typing import Dict

28

from typing import List

29

from typing import Optional

30

from typing import Tuple

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

31

from typing import TYPE_CHECKING

32

33

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.

34

if TYPE_CHECKING:

35

from .npu_performance import CycleCost

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

36

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

37

import numpy as np

38

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

39

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

40

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

41

from . import tensor_allocation

42

from . import weight_compressor

43

from .architecture_allocator import ArchitectureBlockConfig

44

from .architecture_allocator import find_block_config

45

from .architecture_allocator import get_ifm_area_required

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

46

from .architecture_features import ArchitectureFeatures

47

from .architecture_features import Block

48

from .cascade_builder import CascadeBuilder

49

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

50

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

51

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

52

from .nn_graph import Graph

53

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

54

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

55

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

56

from .nn_graph import Subgraph

57

from .numeric_util import round_down

58

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

59

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

60

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

61

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

62

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

63

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

64

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

65

from .tensor import TensorFormat

66

from .tensor import TensorPurpose

67

from .tensor import TensorSubPurpose

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

68

from .weight_compressor import NpuWeightTensor

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

69

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

70

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

71

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the shape that `shape` occupies when stored in `tensor_format`.

    For TensorFormat.NHCWB16 the depth is replaced by round_up(depth, 16);
    for every other format `shape` is returned unchanged.
    """
    if tensor_format != TensorFormat.NHCWB16:
        return shape

    padded_depth = round_up(shape.depth, 16)
    return shape.with_depth(padded_depth)

class OptimizationStrategy(IntEnum):
    """Optimization goals that the Scheduler can be asked to pursue"""

    # Members are numbered automatically in declaration order
    Size = auto()
    Performance = auto()

    def __str__(self) -> str:
        # Print the bare member name instead of the default enum text
        return str(self.name)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

88

class SchedulerOpInfo:
    """Per-Schedule metadata describing how one SchedulerOperation is executed"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        """Store the constructor arguments and reset every derived field to its default."""
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # A single slice covering the whole depth of the stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensor: Optional[Tensor] = None
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo built from the constructor arguments of this one.

        Only the cascade assignment is carried over in addition; all other fields of the
        returned object keep the defaults set by __init__.
        """
        duplicate = SchedulerOpInfo(
            block_config=self.block_config,
            weights_size=self.weights_size,
            stripe_input=self.stripe_input,
            stripe_input2=self.stripe_input2,
            stripe=self.stripe,
        )
        duplicate.cascade = self.cascade
        return duplicate

    def __str__(self):
        """Return a tab-indented, multi-line summary of this object."""
        # The `x and f(x)` expressions print the falsy value itself (e.g. None) when no tensor is set
        fragments = [
            f"\t\tBlock Config = {self.block_config}\n",
            f"\t\tOFM Block = {self.block_config.ofm_block}\n",
            f"\t\tIFM Stripe = {self.stripe_input}\n",
            f"\t\tIFM2 Stripe = {self.stripe_input2}\n",
            f"\t\tOFM Stripe = {self.stripe}\n",
            f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n",
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n",
            f"\t\tDepth slices = {self.ofm_depth_slices}\n",
            f"\t\tAssigned Cascade = {self.cascade}",
        ]
        return "".join(fragments)

class SchedulerOptions:
    """Bundle of settings that steer the Scheduler"""

    def __init__(self, optimization_strategy, sram_target, verbose_schedule):
        # Attribute order is kept stable because __str__ prints __dict__ verbatim
        self.optimization_strategy = optimization_strategy
        # The SRAM target is stored under a different attribute name
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        """Return '<class name>: <attribute dict>'."""
        class_name = type(self).__name__
        attributes = str(self.__dict__)
        return f"{class_name}: {attributes}"

    # repr() gives the same text as str()
    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

160

class SchedulerTensor:
    """Scheduler-side record of a tensor: shape, data type, memory area, format and its Connection"""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # No Connection is attached at construction time
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

167

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

168

169

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'

    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler node for the primary operation of Pass `ps`.

        `nng` is accepted for interface compatibility; it is not used here.
        """
        primary_op = ps.primary_op

        self.arch = arch
        self.parent_ps = ps
        self.parent_op = primary_op
        self.name = primary_op.name
        self.op_type = primary_op.type
        self.activation = primary_op.activation
        self.kernel = primary_op.kernel
        self.resampling_mode = primary_op.ifm_resampling_mode
        # True when the op has a second input and either input has an empty shape
        self.uses_scalar = primary_op.ifm2 is not None and (primary_op.ifm.shape == [] or primary_op.ifm2.shape == [])
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        # The second input is optional
        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input

        Raises AssertionError when this operation has no IFM2.
        """
        if not self.ifm2:
            # Raised explicitly instead of `assert False` so the check still fires under `python -O`
            raise AssertionError(f"Trying to set an IFM2 Connection to {self} which has no IFM2")
        conn.consumers.append(self)
        self.ifm2.connection = conn

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    @staticmethod
    def _storage_size_in_bytes(tens: SchedulerTensor) -> int:
        """Return the byte size of `tens` in its storage format, rounded up to Tensor.AllocationQuantum."""
        storage_shape = shape_for_format(tens.shape, tens.format)
        return round_up(storage_shape.elements() * tens.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        return self._storage_size_in_bytes(self.ifm)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 when there is no IFM2"""
        if self.ifm2:
            return self._storage_size_in_bytes(self.ifm2)
        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        return self._storage_size_in_bytes(self.ofm)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        `stripe` is the OFM shape to produce. When it differs from the full OFM shape, the input
        shapes are reduced to the area needed for that stripe. As a side effect the parent Pass's
        block_config is overwritten with the chosen block config. `nng` is not used.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                # IFM2 is clamped to its own full height and width
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Return the input requirement for the smallest possible stripe (h,w = 1,1)."""
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        """Return the result of find_block_config (a block config and SHRAM layout) for the given shapes."""
        # Two LUT banks are requested when the parent op has an activation LUT
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations

    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        # The Tensor that this edge stands for
        self.parent_tens = tensor

        # SchedulerOperation relationships - each Connection owns its own two lists
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        """Return '<Connection NAME>' where NAME is the name of the parent tensor."""
        return f"<Connection {self.parent_tens.name}>"

    # repr() gives the same text as str()
    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

350

class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Cost information for each SchedulerOperation that is part of this schedule
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        # CascadeInfo objects keyed by an integer
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        """The subgraph name and the schedule label joined by an underscore."""
        subgraph_name = self.sg.name
        return f"{subgraph_name}_{self.label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

364

365

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

366

class Scheduler:

367

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

368

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

369

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

370

self.nng = nng

371

self.sg = sg

372

self.arch = arch

Ayaan Masood

b801dda

2022-02-22 11:28:55 +0000

[diff] [blame]

373

self.sched_ops: List[SchedulerOperation] = []

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

374

self.max_schedule: Optional[Schedule] = None

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

375

self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

376

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

377

def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
    """Return True when the OFM of Pass `ps` should not be given the NHCWB16 format.

    True is returned only when all of the following hold:
    - the optimization strategy is not Performance
    - the primary op of `ps` is an elementwise op
    - the last dimension of the op's OFM shape is not a multiple of 16
    - at least one input has the same op shape as the output, a non-empty tensor shape, the same
      format and dtype as the output tensor, and is in NHWC format

    `tens` and `arch` are accepted but not used.
    """
    # Only run this check for opt strategy Size
    if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
        return False

    primary_op = ps.primary_op
    if not primary_op.type.is_elementwise_op():
        return False

    ofm_depth = primary_op.ofm_shapes[0][-1]
    if ofm_depth % 16 == 0:
        return False

    # Check if overwriting the inputs can be allowed
    OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
    ofm_entry = OpShapeTens(primary_op.ofm_shapes[0], primary_op.ofm)
    # ifm pairs with ifm_shapes[0] and ifm2 with ifm_shapes[1]; missing inputs are left out
    ifm_entries = [
        OpShapeTens(primary_op.ifm_shapes[position], tensor)
        for position, tensor in enumerate((primary_op.ifm, primary_op.ifm2))
        if tensor is not None
    ]

    def output_may_overwrite(candidate):
        return (
            # check op input and output shapes allow overlapping
            candidate.op_shape == ofm_entry.op_shape
            # check input tensor is valid
            and candidate.tens is not None
            and candidate.tens.shape != []
            # check input and output tensors are compatible
            and candidate.tens.format == ofm_entry.tens.format
            and candidate.tens.dtype == ofm_entry.tens.dtype
            # only a linear (NHWC) input counts
            and candidate.tens.format == TensorFormat.NHWC
        )

    # Find an input tensor that can be overwritten by the output
    return any(output_may_overwrite(candidate) for candidate in ifm_entries)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

416

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Creates a Scheduler Graph representation

    For every Pass of the subgraph that has a primary op this:
    - switches eligible output FeatureMaps to NHCWB16 (or marks them as needing linear format)
    - appends a SchedulerOperation to self.sched_ops and wires it to shared Connection objects
    - sets the requires_full_ifm/ifm2/ofm flags of that operation

    The largest per-operation total of the feature maps that must be kept in full is stored in
    self.min_memory_req.
    """
    # Temporary dict for creating connections between the Operations
    connections: Dict[Tensor, Connection] = {}

    def connection_for(tensor):
        # The Connection of a tensor is created the first time the tensor is seen
        if tensor not in connections:
            connections[tensor] = Connection(tensor)
        return connections[tensor]

    # Memory required for the largest FeatureMap that has to be full
    min_memory_req = 0
    for ps in self.sg.passes:
        if not ps.primary_op:
            continue

        # Set tensor format to NHCWB16 for output FeatureMaps, if possible
        for output in ps.outputs:
            if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                continue

            if output.needs_linear_format:
                continue

            if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                output.needs_linear_format = True
                continue

            output.set_format(TensorFormat.NHCWB16, arch)

        # Create SchedulerOperations
        op = SchedulerOperation(ps, arch, self.nng)
        op.index = len(self.sched_ops)

        ifm_tens = ps.ifm_tensor
        ifm2_tens = ps.ifm2_tensor
        ofm_tens = ps.ofm_tensor

        # Make connections
        op.add_ifm_connection(connection_for(ifm_tens))
        if ifm2_tens:
            op.add_ifm2_connection(connection_for(ifm2_tens))
        op.add_ofm_connection(connection_for(ofm_tens))

        # Set requirements on the ifm/ofm buffers
        self.sched_ops.append(op)

        # A full IFM is needed for a subgraph input or a tensor that needs linear format
        if any((ifm_tens in self.sg.input_tensors, ifm_tens.needs_linear_format)):
            op.requires_full_ifm = True

        # The same rule applies to IFM2 when the Pass has one
        if ifm2_tens and any((ifm2_tens in self.sg.input_tensors, ifm2_tens.needs_linear_format)):
            op.requires_full_ifm2 = True

        # Every reason is evaluated before any of them is tested
        full_ofm_reasons = (
            # This Op produces a subgraph output
            ofm_tens in self.sg.output_tensors,
            ofm_tens.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite,
            # Op has multiple outputs or consumers - requires full OFM
            len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1,
        )
        if any(full_ofm_reasons):
            op.requires_full_ofm = True

        # Check memory requirements if this Op requires any full FeatureMaps
        op_memory_req = 0
        if op.requires_full_ifm:
            op_memory_req += op.ifm_size_in_bytes()
        if op.requires_full_ifm2:
            op_memory_req += op.ifm2_size_in_bytes()
        if op.requires_full_ofm:
            op_memory_req += op.ofm_size_in_bytes()

        if op_memory_req > min_memory_req:
            min_memory_req = op_memory_req

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = min_memory_req

489

490

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind

    Every scheduler op is costed for its full OFM shape and full OFM depth; the returned
    Schedule is labelled "MAX".
    """
    max_schedule = Schedule(self.sg, "MAX")

    for sched_op in self.sched_ops:
        # The stripe is the whole OFM, i.e. the op is not striped
        op_info = sched_op.create_scheduler_info(self.nng, sched_op.ofm.shape)
        full_depth = sched_op.ofm.shape.depth
        op_info.cycles = self.estimate_op_performance(sched_op, op_info.block_config, full_depth)
        max_schedule.cost_map[sched_op] = op_info

    return max_schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Store the temporal fast-storage memory usage and its peak on `schedule`.

    Live ranges are extracted from the cascaded passes of the root subgraph of self.nng for the
    fast storage memory area, restricted to the Scratch and Scratch_fast memory types.
    """
    fast_storage_area = self.arch.fast_storage_mem_area
    scratch_mem_types = {MemType.Scratch, MemType.Scratch_fast}

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        fast_storage_area,
        scratch_mem_types,
        lr_graph,
        Tensor.AllocationQuantum,
    )

    # Populate time-array with memory used by live ranges
    usage_over_time = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = usage_over_time

    # Set the peak memory usage (0 when the time-array is empty)
    schedule.fast_storage_peak_usage = max(usage_over_time, default=0)

520

521

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Build a PerformanceQuery for `op` and return the result of npu_performance.measure_cycle_cost.

    The OFM shape in the query is the op's OFM shape with its depth replaced by `ofm_depth`;
    `block_config` is passed through as the query's config.
    """
    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)

    ifm = op.ifm
    query.ifm_shape = ifm.shape
    query.ifm_memory_area = ifm.mem_area
    query.ifm_bits = ifm.dtype.size_in_bits()
    query.ifm_format = ifm.format

    # Without a second input every ifm2 field receives the (falsy) op.ifm2 value itself
    ifm2 = op.ifm2
    query.ifm2_shape = ifm2.shape if ifm2 else ifm2
    query.ifm2_memory_area = ifm2.mem_area if ifm2 else ifm2
    query.ifm2_bits = ifm2.dtype.size_in_bits() if ifm2 else ifm2
    query.ifm2_format = ifm2.format if ifm2 else ifm2

    ofm = op.ofm
    query.ofm_shape = ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = ofm.mem_area
    query.ofm_bits = ofm.dtype.size_in_bits()
    query.ofm_format = ofm.format

    if op.parent_op.bias:
        # The const shape uses the full OFM depth, not `ofm_depth`
        query.const_shape = Shape4D(1, 1, 1, ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    activation = op.activation
    activation_type = activation.op_type if activation else activation
    return npu_performance.measure_cycle_cost(self.arch, op.op_type, activation_type, query)

543

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

544

def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
    """Create a buffered schedule

    The new Schedule carries the label of `ref_schedule` with "_BUFFERED" appended. Operator
    buffering is proposed, in self.sched_ops order, for every op that `ref_schedule` has a cost for.
    """
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

    previous = None
    for candidate in self.sched_ops:
        # Ops that are not part of this sub-schedule are skipped
        if candidate in ref_schedule.cost_map:
            self.propose_operator_buffering(candidate, previous, buffered_schedule, ref_schedule, staging_limit_bytes)
            previous = candidate

    return buffered_schedule

558

559

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for `sched_op` to `buffered_schedule`, derived from `ref_schedule`.

    The entry is a shallow copy of the reference cost with the slack buffering cycles and memory
    filled in. Weight buffering is proposed when the parent op has weights.

    Returns the new cost entry, or None when `sched_op` already has an entry in `buffered_schedule`.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return None

    # Take the reference schedule as default costings for this schedule
    reference = ref_schedule.cost_map[sched_op]
    cost = copy.copy(reference)
    cost.slack_buffering_cycles = reference.cycles.op_cycles

    # Memory in use at this op's time index; 0 when the index is beyond the snapshot
    snapshot = ref_schedule.memory_snapshot
    if reference.time_index < len(snapshot):
        used_bytes = snapshot[reference.time_index]
    else:
        used_bytes = 0
    cost.slack_buffering_memory = staging_limit_bytes - used_bytes
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    parent_op = sched_op.parent_op
    if parent_op.weights:
        self.propose_weight_buffering(
            parent_op.weights,
            parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            cost.slack_buffering_memory,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True when `weight_tensor` should be moved by DMA, otherwise False.

    False is returned for a missing tensor and for a tensor whose memory type is Scratch or
    Scratch_fast. Otherwise the tensor must be in Dram or OffChipFlash and the permanent storage
    memory area must differ from the fast storage memory area.
    """
    if not weight_tensor:
        return False

    if weight_tensor.mem_type in (MemType.Scratch, MemType.Scratch_fast):
        return False

    # Weights are in permanent storage
    # Only when permanent storage differs from feature map storage, there is a point moving the data
    return bool(
        weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
        and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
    )

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Decide how the weights of sched_op are encoded and (optionally) buffered, updating its cost entry.

    The weights are first encoded for the full OFM depth. FullyConnected ops, and weights that
    weights_needs_dma() reports as not needing DMA, use that encoding directly. Otherwise:
      * cascaded ops (ref_cost.cascade != 0) keep the full depth and the weight size is added to the
        reference schedule's memory snapshot at the op's time index;
      * other ops estimate the transfer time of the full weights, derive a prebuffer size from the slack
        cycles of prev_op (clamped to half of buffer_limit_bytes) and, if not everything can be
        prebuffered, split the OFM depth into slices, re-encoding until the largest slice fits in half
        the buffer limit or the prebuffer depth has reached ArchitectureFeatures.OFMSplitDepth.
    Finally a buffer tensor is created if the resulting size fits in buffer_limit_bytes; if not, the
    full-depth encoding is used without buffering.

    Results are written to the entry in buffered_schedule.cost_map; nothing is returned.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    def encode(depth_slices):
        # Both encodings (full depth and sliced) differ only in the depth slices they are given
        return weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            depth_slices,
        )

    needs_dma = self.weights_needs_dma(weight_tensor)

    full_depth = ref_cost.stripe.depth
    ofm_full_depth_slices = [0, full_depth]

    # Encode weights for the full depth
    full_weights, full_scales = encode(ofm_full_depth_slices)
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    encoded_weights: Optional[NpuWeightTensor] = full_weights
    encoded_scales = full_scales

    # NPU cycles available under the previously executing operator, and SRAM left unused by it,
    # that can be spent on buffered DMA transfers
    if prev_cost:
        slack_cycles = prev_cost.slack_buffering_cycles
        slack_memory = prev_cost.slack_buffering_memory
    else:
        slack_cycles = 0
        slack_memory = 0

    if ref_cost.cascade == 0:
        split_depth = ArchitectureFeatures.OFMSplitDepth

        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with the limited
        # double buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            wanted_bytes = (slack_cycles / full_transfer_cycles) * full_weights_bytes
        else:
            wanted_bytes = full_weights_bytes
        prebuffer_bytes = min(wanted_bytes, half_buffer_limit)
        prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            block_depth = cost.block_config.ofm_block.depth

            # Choose initial prebuffering depth (already buffer clamped)
            prebuffer_depth = int(max(16, round_down(full_depth * prebuffer_ratio, split_depth)))

            # Calculate cycles executed during the prebuffer
            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
            buffering_depth = full_depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

            # Choose initial buffering depth and clamp to the double buffering limit
            buffering_depth = round_up(buffering_depth, block_depth)
            buffering_bytes = (buffering_depth / full_depth) * full_weights_bytes
            if buffering_bytes > half_buffer_limit:
                buffering_depth = (half_buffer_limit / full_weights_bytes) * full_depth

            while True:
                # Attempt to buffer whole blocks
                granule = block_depth if buffering_depth > block_depth else split_depth
                buffering_depth = int(max(round_down(buffering_depth, granule), split_depth))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < full_depth:
                    depth_slices.extend(range(prebuffer_depth, full_depth, buffering_depth))
                depth_slices.append(full_depth)

                # Encode weights based on the depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = encode(cost.ofm_depth_slices)
                assert encoded_weights is not None

                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if encoded_weights.max_range_bytes <= half_buffer_limit or prebuffer_depth == split_depth:
                    break

                if buffering_depth > prebuffer_depth:
                    buffering_depth = round_up(buffering_depth // 2, split_depth)
                else:
                    prebuffer_depth = round_up(prebuffer_depth // 2, split_depth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)
    else:
        # Force full depth for cascaded Ops
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Double buffer only when two buffers fit and a single buffer does not already hold everything
        double_buffered = (weight_buffer_size * 2 <= buffer_limit_bytes) and (
            weight_buffer_size < len(encoded_weights.buffer)
        )
        if double_buffered:
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = self.buffer_tensor(
            encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name
        )
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

768

Jacob Bohlin

2021-08-17 17:44:45 +0200

[diff] [blame]

769

def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
    """Build the tensor that holds buffered weights in fast storage.

    The result is a uint8 tensor of shape [1, 1, 1, buffer_size] named name + "_buffer". It references
    src_tensor, lives in the architecture's fast storage area with mem type Scratch_fast, and has purpose
    Weights with the given sub_purpose.
    """
    buffer_shape = [1, 1, 1, buffer_size]
    weight_buffer = Tensor(buffer_shape, DataType.uint8, name + "_buffer")
    weight_buffer.src_tensor = src_tensor
    weight_buffer.mem_area = self.arch.fast_storage_mem_area
    weight_buffer.mem_type = MemType.Scratch_fast
    weight_buffer.purpose = TensorPurpose.Weights
    weight_buffer.sub_purpose = sub_purpose
    return weight_buffer

777

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

778

def propose_minimal_schedule(self) -> Schedule:
    """Propose the "MIN" schedule, in which each operator gets the smallest stripe that satisfies the stride of
    the operator consuming its output.

    Operators are visited in reverse order. The stripe height of an op is the y-stride of the op visited just
    before it (1 for the first op visited), and its cycles are estimated for the full OFM depth.
    """
    min_schedule = Schedule(self.sg, "MIN")

    # The op handled in the previous iteration - which consumes the current Op's OFM
    consumer: Optional[SchedulerOperation] = None
    for sched_op in reversed(self.sched_ops):
        stripe_height = consumer.kernel.stride.y if consumer else 1
        stripe = sched_op.ofm.shape.with_height(stripe_height)

        op_cost = sched_op.create_scheduler_info(self.nng, stripe)
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        min_schedule.cost_map[sched_op] = op_cost

        consumer = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Build a schedule named label that re-stripes the ops of ref_schedule, starting from final_stripe.

    Ops are walked in reverse order and ops without an entry in ref_schedule.cost_map are skipped. The
    stripe of each preceding op is derived from the current op's IFM shape, with its height set to the
    current stripe height multiplied by the current op's y-stride.
    """
    reference_costs = ref_schedule.cost_map
    striped_schedule = Schedule(self.sg, label)

    stripe = final_stripe
    for sched_op in reversed(self.sched_ops):
        if sched_op not in reference_costs:
            # sched_op is not part of the sub-schedule - skip
            continue

        # Create a cost entry with the new stripe
        op_cost = sched_op.create_scheduler_info(self.nng, stripe)

        if reference_costs[sched_op].buffered_weight_tensor:
            # Weights buffered in the reference schedule should be buffered in the new proposal as well
            npu_weights = op_cost.npu_weights_tensor
            op_cost.buffered_weight_tensor = self.buffer_tensor(
                npu_weights, TensorSubPurpose.Standard, len(npu_weights.buffer), npu_weights.name
            )

        # Estimate performance
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        striped_schedule.cost_map[sched_op] = op_cost

        # Calculate the preceding Op's stripe
        stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

    return striped_schedule

827

828

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Return the peak memory usage over all ops of self.sched_ops that appear in schedule.cost_map.

    A cascaded op contributes the mem_usage of its cascade (looked up by the op's cascade key). Any other op
    contributes IFM size + OFM size + the storage size of its buffered weight tensor (0 if it has none) + its
    entry in non_local_mem_usage (0 if absent). Returns 0 when no op is part of the schedule.
    """
    op_costs = schedule.cost_map
    cascade_table = schedule.cascades

    peak = 0
    for sched_op in self.sched_ops:
        if sched_op not in op_costs:
            # sched_op is not part of the sub-schedule - skip
            continue

        op_cost = op_costs[sched_op]
        if op_cost.cascade:
            # Part of a cascade - the cascade's figure already includes non-local memory usage
            usage = cascade_table[op_cost.cascade].mem_usage
        else:
            # Stand-alone op - add up its own buffers
            weight_buffer = op_cost.buffered_weight_tensor
            weight_bytes = weight_buffer.storage_size() if weight_buffer else 0
            usage = (
                sched_op.ifm_size_in_bytes()
                + sched_op.ofm_size_in_bytes()
                + weight_bytes
                + non_local_mem_usage.get(sched_op, 0)
            )
        peak = max(usage, peak)

    return peak

858

859

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Extract the ops covered by cascade_info into a sub-schedule and search for a better striping of it.

    Weight buffering is proposed for the sub-schedule first. Candidate stripe heights for the final op run
    from 1 to half of its OFM height and are narrowed down in a binary-search fashion: each proposal has
    cascades built for it and is kept as the best candidate if its estimated memory usage does not exceed
    memory_limit.

    Returns the last proposal that fitted, or None if no proposal fitted or fewer than two candidate stripes
    exist.
    """
    ref_costs = ref_schedule.cost_map

    # Extract the ops that are part of this sub-schedule
    first, last = cascade_info.start, cascade_info.end
    sub_ops = self.sched_ops[first : last + 1]

    # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
    sub_schedule = Schedule(self.sg, f"SUB_{first}_{last}")
    for sched_op in sub_ops:
        sub_schedule.cost_map[sched_op] = ref_costs[sched_op]
    sub_schedule.cascades[last] = cascade_info
    # Use the memory snapshot from the reference schedule
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Memory that is live during the sub-schedule but not part of it
    head_op = sub_ops[0]
    cascade_time = ref_costs[head_op].time_index
    parallel_mem_usage = ref_schedule.memory_snapshot[cascade_time] - cascade_info.mem_usage

    # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether
    # it's included in a cascade or not
    if len(head_op.ifm.connection.consumers) > 1:
        persistent_initial_ifm = head_op.ifm_size_in_bytes()
    else:
        persistent_initial_ifm = 0

    # Non-local memory usage per Operator; every op but the first also carries the persistent IFM
    non_local_mem_usage = {}
    for position, sched_op in enumerate(sub_ops):
        if position == 0:
            non_local_mem_usage[sched_op] = parallel_mem_usage
        else:
            non_local_mem_usage[sched_op] = parallel_mem_usage + persistent_initial_ifm

    cascade_builder = CascadeBuilder(sub_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Start by adding buffering
    buffered_sub_schedule = self.propose_schedule_buffering(
        sub_schedule, self.scheduler_options.optimization_sram_limit
    )
    # Copy the cascades over from the unbuffered-schedule
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Generate the possible stripings for the final Op in the sub-schedule
    final_ofm_shape = sub_ops[-1].ofm.shape
    max_stripe_height = final_ofm_shape.height // 2
    candidates = [final_ofm_shape.with_height(height) for height in range(1, max_stripe_height + 1)]

    # Propose different striping - the possible stripes are proposed similarly to a binary search
    best_schedule = None
    iteration = 0
    while len(candidates) > 1:
        middle = len(candidates) // 2
        proposal = self.propose_schedule_striping(
            candidates[middle], f"OPTIMIZED_{iteration}", buffered_sub_schedule
        )

        cascade_builder.build_cascades(proposal, max_template, memory_limit)

        # Check if proposal fits
        proposal_mem_usage = self.estimate_schedule_memory_usage(proposal, non_local_mem_usage)
        if proposal_mem_usage <= memory_limit:
            # Remove all possible stripes smaller than this
            candidates = candidates[middle:]
            best_schedule = proposal
            if not proposal.cascades:
                # No cascading required - early exit
                break
        else:
            # Proposal doesn't fit within the limit - remove all possible stripes larger than this
            candidates = candidates[:middle]

        iteration += 1

    return best_schedule

def optimize_schedule(
    self,
    schedule: Schedule,
    max_sched: Schedule,
    max_template: Schedule,
    options: SchedulerOptions,
) -> Schedule:
    """Optimize every cascade of schedule as a sub-schedule and fold the results back into schedule.

    max_sched is returned unchanged when its fast storage peak usage is below
    options.optimization_sram_limit and spilling is disabled. Otherwise each cascade is passed to
    optimize_sub_schedule(); a truthy result replaces the cascade and updates the op costs. The memory
    snapshot is then refreshed and weight buffering proposed for the whole schedule.
    """
    sram_limit = options.optimization_sram_limit
    fits_in_sram = max_sched.fast_storage_peak_usage < sram_limit
    if fits_in_sram and not self.arch.is_spilling_enabled():
        # Maximum performance schedule fits within the SRAM target
        return max_sched

    # Iterate over a copy of the cascades since they may change during the loop
    for cascade_info in list(schedule.cascades.values()):
        # Optimize the sub-schedule in this cascade
        optimized_sub = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        if not optimized_sub:
            continue
        # Remove the existing cascade
        del schedule.cascades[cascade_info.end]
        # Update the sub-schedule Op and cascade costs to the full schedule
        schedule.cost_map.update(optimized_sub.cost_map)
        schedule.cascades.update(optimized_sub.cascades)

    # Update memory snapshot
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Propose schedule buffering to the optimized schedule
    buffered = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
    # Copy the cascade's metadata from the unbuffered schedule
    buffered.cascades = schedule.cascades
    return buffered

968

969

def apply_schedule(self, sched: Schedule):
    """Apply sched as the final solution to every op in self.sched_ops.

    For an op that has a buffer in its cascade, the IFM's parent tensor is moved to fast storage
    (Scratch_fast), given the NHCWB16 format and turned into a RollingBufferY of the buffer's height.
    Every op's pass receives the old-style representation of the chosen block config, and any buffered
    weight tensor is re-pointed at the op's NPU weights tensor.
    """
    for sched_op in self.sched_ops:
        op_info = sched.cost_map[sched_op]

        cascade_info = sched.cascades.get(op_info.cascade)
        if cascade_info and sched_op in cascade_info.buffers:
            rolling_tens = sched_op.ifm.connection.parent_tens
            # Apply memory area and type
            rolling_tens.mem_area = self.arch.fast_storage_mem_area
            rolling_tens.mem_type = MemType.Scratch_fast
            # Apply Rolling buffer
            rolling_tens.set_format(TensorFormat.NHCWB16, self.arch)
            rolling_height = cascade_info.buffers[sched_op].height
            rolling_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, rolling_height)

        sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

        # Ensure that the src_tensor reference is set correctly
        weight_buffer = op_info.buffered_weight_tensor
        if weight_buffer:
            weight_buffer.src_tensor = op_info.npu_weights_tensor

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

988

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

989

def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
    """Place as many OFM tensors as possible in fast storage without exceeding staging_limit.

    Every OFM of a non-cascaded op that has dependants, and whose tensor has no None consumer, is first
    moved to fast storage (its original area/type is remembered). Live ranges are then extracted; if the
    temporal memory usage stays within staging_limit everything is kept. Otherwise live ranges that can
    never fit are evicted, those that always fit are kept, and the remaining competing live ranges are
    grouped into components that are handed to FastStorageComponentAllocator.
    """
    fast_mem_type = MemType.Scratch_fast
    fast_mem_area = self.arch.fast_storage_mem_area
    # Original (mem_area, mem_type) of every tensor forced to fast storage, used to undo the move
    scratched_fms = {}

    # Force all OFMs to fast-storage
    for sched_op in self.sched_ops:
        op_cost = schedule.cost_map[sched_op]
        if op_cost.cascade != 0 or not sched_op.get_dependants():
            continue
        ofm_tens = sched_op.ofm.connection.parent_tens
        if any(consumer is None for consumer in ofm_tens.consumer_list):
            continue
        if ofm_tens not in scratched_fms:
            scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
        ofm_tens.mem_area = fast_mem_area
        ofm_tens.mem_type = fast_mem_type

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        fast_mem_area,
        {MemType.Scratch, MemType.Scratch_fast},
        lr_graph,
        Tensor.AllocationQuantum,
    )
    max_mem_usage = lr_graph.get_temporal_memory_usage(fast_mem_area)

    # If true, everything fits and we can proceed
    if max(max_mem_usage) <= staging_limit:
        return

    # Build up the base memory usage by removing the
    # mem_usage of the lrs we previously moved to fast-storage
    base_mem_usage = np.array(max_mem_usage)
    moved_lrs = []
    for lr in lr_graph.lrs:
        if any(scratched_fms.get(tens) for tens in lr.tensors):
            moved_lrs.append(lr)
            base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size

    competing_lrs = []
    for lr in moved_lrs:
        lifetime = slice(lr.start_time, lr.end_time + 1)
        # If true, the lr will never fit and may thus be evicted
        if max(base_mem_usage[lifetime]) + lr.size > staging_limit:
            FastStorageComponentAllocator.evict(lr, max_mem_usage, scratched_fms)
            continue
        # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
        # the memory limit cannot be exceeded if max_mem_usage does not.
        # Thus, the affected lrs can remain in fast-storage if the following is true
        if max(max_mem_usage[lifetime]) <= staging_limit:
            FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
        else:
            competing_lrs.append(lr)

    num_competing = len(competing_lrs)
    # All lrs and their tensors have been handled if nothing competes, we may thus return
    if num_competing == 0:
        return

    competing_lrs.sort(key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
    component_start = 0
    start_time = competing_lrs[0].start_time
    end_time = competing_lrs[0].end_time
    component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)

    # Build up components and then allocate each separately
    for index, lr in enumerate(competing_lrs):
        overlaps_component = lr.start_time <= end_time
        if overlaps_component and index - component_start < component_allocator.max_exhaustive_size:
            # Still the same component - extend its time window
            start_time = min(start_time, lr.start_time)
            end_time = max(end_time, lr.end_time)
            continue

        component_allocator.allocate_component(
            component_allocator,
            competing_lrs[component_start:index],
            max_mem_usage,
            base_mem_usage,
            staging_limit,
            scratched_fms,
        )
        component_start = index
        start_time = lr.start_time
        end_time = lr.end_time

    # Allocate the final component
    component_allocator.allocate_component(
        component_allocator,
        competing_lrs[component_start:num_competing],
        max_mem_usage,
        base_mem_usage,
        staging_limit,
        scratched_fms,
    )

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1079

1080

def move_constant_data(self):
    """Clone selected constant inputs out of permanent storage into fast storage.

    As noted in the original description, such a move generates a DMA command in the high-level command
    stream. An input is considered when it is not a Scratch/Scratch_fast tensor and either is a LUT or lies
    in permanent storage that differs from the feature map storage. Of those, LUTs are always cloned (and
    placed in Shram); feature maps are cloned only for the elementwise-broadcast case checked below, and
    only if not all of their consumers are VectorProduct operations.
    """
    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        usable_banks = self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = usable_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
                continue

            # Tensor is in permanent storage
            # Only when permanent storage differs from feature map storage, there is a point moving the data
            if not (
                (
                    tens.mem_area in self.arch.permanent_storage_mem_area
                    and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                )
                or tens.purpose == TensorPurpose.LUT
            ):
                continue

            is_lut = tens.purpose == TensorPurpose.LUT
            if not is_lut:
                # For elementwise broadcast
                is_large_broadcast_fm = (
                    tens.purpose == TensorPurpose.FeatureMap
                    and sched_op.op_type.is_binary_elementwise_op()
                    and tens.shape != []
                    and sched_op.ifm.shape != sched_op.ofm.shape
                    and parent_op.write_shape is None
                    and tens.storage_size() > max_ifm_shram_avail
                )
                if not is_large_broadcast_fm:
                    continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            if only_vector_product_consumers and tens.purpose != TensorPurpose.LUT:
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if tens.purpose == TensorPurpose.LUT:
                new_tens.mem_area = MemArea.Shram

            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            # If the index is out of range, IFM and IFM2 are the same tensor
            # and pass inputs don't have duplicates
            pass_inputs = sched_op.parent_ps.inputs
            if idx < len(pass_inputs):
                pass_inputs[idx] = new_tens

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1125

1126

def print_schedule(self, schedule: Schedule):
    """Print schedule to stdout: one section per op that has a cost entry, then the list of cascades.

    The SRAM figure of an op is read from the schedule's memory snapshot at the op's time index and is
    reported as 0 when that index lies outside the snapshot.
    """
    print(f"Schedule: '{schedule.name}'")

    snapshot = schedule.memory_snapshot
    for sched_op in self.sched_ops:
        op_info = schedule.cost_map.get(sched_op) if sched_op in schedule.cost_map else None
        if sched_op not in schedule.cost_map:
            # Sub-schedule printing - ops outside the schedule are not shown
            continue

        print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
        print(f"\t\tType: {sched_op.op_type}")
        print(f"\t\tKernel: {sched_op.kernel}")
        print(f"{op_info}")
        if op_info.time_index < len(snapshot):
            mem_usage = snapshot[op_info.time_index]
        else:
            mem_usage = 0
        print(f"\t\tSRAM Used: {mem_usage} bytes")

    print("\tCascades:")
    for position, cascade in enumerate(schedule.cascades.values()):
        print(f"\t\t{position}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

1148

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1149

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1150

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """Run the tensor allocator on the root subgraph for the scratch memories of the current schedule.

    With spilling enabled two allocations are made, in this order: Scratch_fast tensors in the fast storage
    area, then Scratch tensors in the feature map storage area. Without spilling, Scratch and Scratch_fast
    tensors are allocated together in the feature map storage area. The allocator, verbosity and CPU tensor
    alignment are taken from options. This function itself returns nothing.
    """
    root_sg = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important
        alloc_list = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        alloc_list = [(arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )

1180

1181

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1182

class FastStorageComponentAllocator:
    """Exhaustively decides which live ranges of a component stay in fast storage and which are evicted.

    A live range (lr) is any object with start_time, end_time (inclusive), size and tensors attributes.
    Memory usage arguments are sequences indexed by time step.
    """

    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        # Usage per time step with the undecided lrs left out; stored by reference and updated in place
        self.base_mem_usage = base_mem_usage
        # Usage per time step with the undecided lrs included; private copy of the argument
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        # Best eviction decision found so far, one flag per lr
        self.evicted = []
        # Eviction decision of the branch currently being explored
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_allocated_size = 0
        # Largest number of lrs the caller puts into one component
        self.max_exhaustive_size = 20

    def allocate_exhaustive(self, ix, alloc_size):
        """Recursively try keeping and evicting self.lrs[ix:], recording the best complete decision.

        alloc_size is the total size of the lrs kept so far on this branch. When all lrs are decided, the
        branch replaces the recorded best if it keeps strictly more bytes.
        """
        if ix >= len(self.lrs):
            if alloc_size > self.best_allocated_size:
                self.best_allocated_size = alloc_size
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        # Sanity check over the whole lifetime; end_time is inclusive, as everywhere else in this class
        for t in range(lr.start_time, lr.end_time + 1):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]

        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        can_fit = base_usage + lr.size <= self.staging_limit
        # An lr can only "always fit" if it can fit at all
        always_fits = False

        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

            # Explore the branch where this lr is kept in fast storage
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            self.allocate_exhaustive(ix + 1, alloc_size + lr.size)
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            # Explore the branch where this lr is evicted; pointless if keeping it can never overflow
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, alloc_size)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        """Add (increase=True) or subtract lr.size over the lr's lifetime, in place, and return mem_usage."""
        delta = lr.size if increase else -lr.size
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += delta
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        """Remove lr from max_mem_usage and restore the saved mem_area/mem_type of its tensors.

        scratched_fms maps a tensor to its original (mem_area, mem_type); tensors not in it are untouched.
        """
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area, tens.mem_type = scratched_fms[tens][0], scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        """Add lr to base_mem_usage over its lifetime, asserting the staging limit is respected."""
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms):
        """Find the best keep/evict decision for lrs using allocator, then apply it.

        Evicted lrs are removed from max_mem (and their tensors restored); kept lrs are added to min_mem.
        """
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * sz
        allocator.curr_evicted = [0] * sz
        # -1 so that even a decision keeping zero bytes is recorded
        allocator.best_allocated_size = -1
        # Recursively evaluate all permutations of allocations of the lrs found in the component
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for lr, is_evicted in zip(lrs, allocator.evicted):
            if is_evicted:
                self.evict(lr, max_mem, scratched_fms)
            else:
                self.keep(lr, min_mem, staging_limit)

1261

1262

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1263

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1264

"""Entry point for the Scheduler"""

1265

# Initialize CPU subgraphs

1266

schedulers = dict()

1267

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1268

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1269

if sg.placement != PassPlacement.Npu:

1270

# Create cascaded passes for CPU Ops

1271

cascaded_passes = []

1272

for idx, ps in enumerate(sg.passes):

1273

cps = CascadedPass(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

1274

ps.name,

1275

SchedulingStrategy.WeightStream,

ps.inputs,

[],

ps.outputs,

[ps],

ps.placement,

False,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1282

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1283

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1284

cps.time = idx

1285

ps.cascade = cps

1286

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1287

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1288

sg.cascaded_passes = cascaded_passes

1289

else:

1290

# Npu subgraph - create schedule

1291

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1292

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1293

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1294

scheduler.create_scheduler_representation(arch)

1295

sg.sched_ops = scheduler.sched_ops

1296

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1297

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1298

# Create the Max schedule template

1299

max_schedule_template = scheduler.create_initial_schedule()

1300

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1301

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1302

# Create the optimised Max schedule

1303

sg.schedule = max_schedule_template

1304

scheduler.update_op_memory_snapshot(max_schedule_template)

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

1305

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1306

sg.schedule = opt_max_schedule

1307

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1308

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1309

# Create Min schedule

1310

min_schedule = scheduler.propose_minimal_schedule()

1311

initial_sram_limit = scheduler_options.optimization_sram_limit

1312

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1313

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1314

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1315

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1316

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1317

sg.schedule = min_schedule

1318

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1319

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1320

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1321

# Create an optimized schedule

1322

sg.schedule = scheduler.optimize_schedule(

1323

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1324

)

1325

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1326

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1327

scheduler.apply_schedule(sg.schedule)

1328

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1329

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1330

if scheduler_options.verbose_schedule:

1331

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1332

Tim Hall