# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
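# The scheduling is done in a number of steps: a max schedule (no striping) and a min schedule (maximum
# striping) are created, cascades are built for the min schedule and, for the Performance strategy, each
# cascade is then optimized by proposing weight buffering and new stripe sizes until the SRAM target is met.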
# Enable class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"

class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
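        """Check whether the OFM of this pass is better kept in linear (NHWC) format.
        Returns True only for the Size optimization strategy, for elementwise ops whose OFM depth is not a
        multiple of 16 and whose OFM can overwrite one of its NHWC inputs (matching shape, format and dtype)."""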
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
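        # The SRAM headroom left at this op's time index bounds how much weight buffering can be added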
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                cost.slack_buffering_memory,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)
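                # buffering_depth is the slice depth whose weight transfer can be hidden under the compute
                # time of the prebuffered depth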

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_bytes > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)
                last_used_buffer_idx = len(cost.ofm_depth_slices) % 2
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]
                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_sizes[0] < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            for buffered_tens in ref_cost[sched_op].buffered_weight_tensors:
                # If the weights are buffered in the reference schedule they should be in the new proposal
                weight_tensor = cost.npu_weights_tensor
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor, TensorSubPurpose.Standard, buffered_tens.storage_size(), buffered_tens.name
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe
            stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape
        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
        ]

        # Propose different striping - the possible stripes are proposed similarly to a binary search
        best_schedule = None
        iteration = 0
        while len(possible_stripes) > 1:
            proposed_stripe = possible_stripes[len(possible_stripes) // 2]
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit:
                # Remove all possible stripes smaller than this
                possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
                best_schedule = proposed_schedule
                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                # Proposal doesn't fit within the limit - remove all possible stripes larger than this
                possible_stripes = possible_stripes[: len(possible_stripes) // 2]

            iteration += 1

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

    def apply_schedule(self, sched: Schedule):
        """Applies the given schedule as a final solution"""
        for sched_op in self.sched_ops:
            op_info = sched.cost_map[sched_op]
            cascade_info = sched.cascades.get(op_info.cascade, None)
            if cascade_info and sched_op in cascade_info.buffers:
                buffer_tens = sched_op.ifm.connection.parent_tens
                # Apply memory area and type
                buffer_tens.mem_area = self.arch.fast_storage_mem_area
                buffer_tens.mem_type = MemType.Scratch_fast
                # Apply Rolling buffer
                buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
                buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)

            sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

            # Ensure that the src_tensor reference is set correctly
            for tens in op_info.buffered_weight_tensors:
                tens.src_tensor = op_info.npu_weights_tensor

    def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
        scratched_fms = {}
        max_mem_usage = []
        base_mem_usage = []
        fast_storage_type = MemType.Scratch_fast
        fast_storage_mem_area = self.arch.fast_storage_mem_area

        # Force all OFMs to fast-storage
        for sched_op in self.sched_ops:
            cost = schedule.cost_map[sched_op]
            if cost.cascade == 0 and sched_op.get_dependants():
                ofm_tens = sched_op.ofm.connection.parent_tens
                if not any(cons is None for cons in ofm_tens.consumer_list):
                    if ofm_tens not in scratched_fms:
                        scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
                    ofm_tens.mem_area = fast_storage_mem_area
                    ofm_tens.mem_type = fast_storage_type

        # Collect live ranges from tensors
        memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )
        max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

        # If true, everything fits and we can proceed
        if max(max_mem_usage) <= staging_limit:
            return

        # Build up the base memory usage by removing the
        # mem_usage of the lrs we previously moved to fast-storage
        base_mem_usage = np.array(max_mem_usage)
        curr_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if scratched_fms.get(tens):
                    curr_lrs.append(lr)
                    base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
                    break

        competing_lrs = []
        for lr in curr_lrs:
            base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
            # If true, the lr will never fit and may thus be evicted
            if base_usage + lr.size > staging_limit:
                FastStorageComponentAllocator.evict(lr, max_mem_usage, scratched_fms)
                continue
            # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
            # the memory limit cannot be exceeded if max_mem_usage does not.
            # Thus, the affected lrs can remain in fast-storage if the following is true
            if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
                FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
            else:
                competing_lrs.append(lr)
        sz = len(competing_lrs)
        # All lrs and their tensors have been handled if sz is zero, we may thus return
        if sz == 0:
            return

        competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
        start = 0
        start_time = competing_lrs[0].start_time
        end_time = competing_lrs[0].end_time
        component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
        # Build up components and then allocate each separately
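        # A component is a group of live ranges that overlap in time, capped at max_exhaustive_size
        # entries since each component is allocated with an exhaustive search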
        for i, lr in enumerate(competing_lrs):
            if lr.start_time <= end_time and i - start < component_allocator.max_exhaustive_size:
                start_time = min(start_time, lr.start_time)
                end_time = max(end_time, lr.end_time)
            else:
                component_allocator.allocate_component(
                    component_allocator,
                    competing_lrs[start:i],
                    max_mem_usage,
                    base_mem_usage,
                    staging_limit,
                    scratched_fms,
                )
                start = i
                start_time = lr.start_time
                end_time = lr.end_time
        component_allocator.allocate_component(
            component_allocator, competing_lrs[start:sz], max_mem_usage, base_mem_usage, staging_limit, scratched_fms
        )

    def move_constant_data(self):
        """Determine if data can be moved from permanent storage to another memory area. A move
        will generate a DMA command in the high-level command stream"""
        for sched_op in self.sched_ops:
            parent_op = sched_op.parent_op
            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
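            # SHRAM available for IFM data: half of the banks left after reserving the output banks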
            max_ifm_shram_avail = (
                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
                * self.arch.shram_bank_size
                // 2
            )

            for idx, tens in enumerate(parent_op.inputs):
                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    # Tensor is in permanent storage
                    # Only when permanent storage differs from feature map storage is there a point in moving the data
                    if (
                        tens.mem_area in self.arch.permanent_storage_mem_area
                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                    ) or tens.purpose == TensorPurpose.LUT:
                        if tens.purpose == TensorPurpose.LUT or (
                            # For elementwise broadcast
                            tens.purpose == TensorPurpose.FeatureMap
                            and sched_op.op_type.is_binary_elementwise_op()
                            and tens.shape != []
                            and sched_op.ifm.shape != sched_op.ofm.shape
                            and parent_op.write_shape is None
                            and tens.storage_size() > max_ifm_shram_avail
                        ):
                            only_vector_product_consumers = all(
                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                                for oper in tens.consumers()
                            )

                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                                new_tens = tens.clone_into_fast_storage(self.arch)
                                if tens.purpose == TensorPurpose.LUT:
                                    new_tens.mem_area = MemArea.Shram

                                new_tens.consumer_list.append(parent_op)
                                parent_op.inputs[idx] = new_tens
                                # If the index is out of range, IFM and IFM2 are the same tensor
                                # and pass inputs don't have duplicates
                                if idx < len(sched_op.parent_ps.inputs):
                                    sched_op.parent_ps.inputs[idx] = new_tens

    def print_schedule(self, schedule: Schedule):
        print(f"Schedule: '{schedule.name}'")
        for sched_op in self.sched_ops:
            if sched_op not in schedule.cost_map:
                # Sub-schedule printing
                continue

            op_info = schedule.cost_map[sched_op]
            print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
            print(f"\t\tType: {sched_op.op_type}")
            print(f"\t\tKernel: {sched_op.kernel}")
            print(f"{op_info}")
            mem_usage = (
                schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(schedule.memory_snapshot)
                else 0
            )
            print(f"\t\tSRAM Used: {mem_usage} bytes")

        print("\tCascades:")
        for i, cascade in enumerate(schedule.cascades.values()):
            print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Creates live ranges and runs tensor allocator for the current schedule
    (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
    and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )


class FastStorageComponentAllocator:
    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        self.base_mem_usage = base_mem_usage
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        self.evicted = []
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_allocated_size = 0
        self.max_exhaustive_size = 20

    def allocate_exhaustive(self, ix, alloc_size):
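        # Recursively try keeping (when it can fit) or evicting each remaining live range, tracking the
        # combination that keeps the largest total size resident in fast storage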
        if ix >= len(self.lrs):
            if alloc_size > self.best_allocated_size:
                self.best_allocated_size = alloc_size
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        for t in range(lr.start_time, lr.end_time):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        can_fit = base_usage + lr.size <= self.staging_limit
        always_fits = can_fit

        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

        if can_fit or always_fits:
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            self.allocate_exhaustive(ix + 1, alloc_size + lr.size)
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, alloc_size)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += lr.size if increase else -lr.size
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area = scratched_fms[tens][0]
                tens.mem_type = scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms):
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * len(lrs)
        allocator.curr_evicted = [0] * sz
        allocator.best_allocated_size = -1
        # Recursively evaluate all permutations of allocations of the lrs found in the component
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for i, e in enumerate(allocator.evicted):
            if e:
                self.evict(lrs[i], max_mem, scratched_fms)
            else:
                self.keep(lrs[i], min_mem, staging_limit)


def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
    """Entry point for the Scheduler"""
    # Initialize CPU subgraphs
    schedulers = dict()
    # Initialize schedulers with max schedule. Only schedule NPU subgraphs
    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            # Create cascaded passes for CPU Ops
            cascaded_passes = []
            for idx, ps in enumerate(sg.passes):
                cps = CascadedPass(
                    ps.name,
                    SchedulingStrategy.WeightStream,
                    ps.inputs,
                    [],
                    ps.outputs,
                    [ps],
                    ps.placement,
                    False,
                )

                cps.time = idx
                ps.cascade = cps
                cascaded_passes.append(cps)

            sg.cascaded_passes = cascaded_passes
        else:
            # Npu subgraph - create schedule
            scheduler = Scheduler(nng, sg, arch, scheduler_options)
            schedulers[sg] = scheduler

            scheduler.create_scheduler_representation(arch)
            sg.sched_ops = scheduler.sched_ops
            scheduler.move_constant_data()

            # Create the Max schedule template
            max_schedule_template = scheduler.create_initial_schedule()
            scheduler.max_schedule = max_schedule_template

            # Create the optimised Max schedule
            sg.schedule = max_schedule_template
            scheduler.update_op_memory_snapshot(max_schedule_template)
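            # 1 << 32 bytes is effectively an unlimited staging limit, so buffering is proposed without
            # any SRAM restriction for the max schedule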
            opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
            sg.schedule = opt_max_schedule
            scheduler.update_op_memory_snapshot(opt_max_schedule)

            # Create Min schedule
            min_schedule = scheduler.propose_minimal_schedule()
            initial_sram_limit = scheduler_options.optimization_sram_limit
            if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
                initial_sram_limit = scheduler.min_memory_req

            cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())
            cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)
            sg.schedule = min_schedule
            scheduler.update_op_memory_snapshot(min_schedule)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
                # Create an optimized schedule
                sg.schedule = scheduler.optimize_schedule(
                    min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
                )
                scheduler.update_op_memory_snapshot(sg.schedule)

            scheduler.apply_schedule(sg.schedule)
            scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

            if scheduler_options.verbose_schedule:
                scheduler.print_schedule(sg.schedule)

    # Evaluate schedule
    _update_tensor_allocation(nng, arch, options)