Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame^]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

20

import copy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

21

from enum import auto

22

from enum import IntEnum

23

from typing import Dict

24

from typing import List

25

from typing import Optional

26

from typing import Tuple

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

28

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

29

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

30

from . import tensor_allocation

31

from . import weight_compressor

32

from .architecture_allocator import ArchitectureBlockConfig

33

from .architecture_allocator import find_block_config

34

from .architecture_allocator import get_ifm_area_required

35

from .architecture_allocator import to_upscale

36

from .architecture_features import ArchitectureFeatures

37

from .architecture_features import Block

38

from .cascade_builder import CascadeBuilder

39

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

40

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

41

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

42

from .nn_graph import Graph

43

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

44

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

45

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

46

from .nn_graph import Subgraph

47

from .numeric_util import round_down

48

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

49

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

50

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

51

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

52

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

53

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

54

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

55

from .tensor import TensorFormat

56

from .tensor import TensorPurpose

57

from .tensor import TensorSubPurpose

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

58

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

59

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

60

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return `shape` adjusted for the given tensor format.

    For TensorFormat.NHCWB16 the depth is rounded up to a multiple of 16.
    Every other format gets the shape back unchanged.
    """
    if tensor_format != TensorFormat.NHCWB16:
        return shape

    padded_depth = round_up(shape.depth, 16)
    return shape.with_depth(padded_depth)

class OptimizationStrategy(IntEnum):
    """Optimization strategies that can be selected for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        # Print only the member name (e.g. "Size") rather than the default enum text
        return f"{self.name}"

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

77

class SchedulerOpInfo:
    """Per-Schedule metadata attached to a single SchedulerOperation"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        # Values supplied by the caller
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        # Defaults, filled in later by other parts of the scheduler
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo with the same constructor arguments and cascade assignment.

        All other attributes of the new object keep their constructor defaults.
        """
        duplicate = SchedulerOpInfo(
            self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,
        )
        duplicate.cascade = self.cascade
        return duplicate

    def __str__(self):
        # The two tensor entries print the tensor itself (e.g. None) when it has not been set
        parts = [
            f"\t\tBlock Config = {self.block_config}\n",
            f"\t\tOFM Block = {self.block_config.ofm_block}\n",
            f"\t\tIFM Stripe = {self.stripe_input}\n",
            f"\t\tIFM2 Stripe = {self.stripe_input2}\n",
            f"\t\tOFM Stripe = {self.stripe}\n",
            f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n",
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n",
            f"\t\tDepth slices = {self.ofm_depth_slices}\n",
            f"\t\tAssigned Cascade = {self.cascade}",
        ]
        return "".join(parts)

class SchedulerOptions:
    """Holds the options that configure the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        # The SRAM target is stored under the name optimization_sram_limit
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        class_name = type(self).__name__
        attributes = str(self.__dict__)
        return f"{class_name}: {attributes}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

139

class SchedulerTensor:
    """Tensor description held by a SchedulerOperation: shape, data type, memory area and format"""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype, self.mem_area = dt, mem_area
        self.shape, self.format = shape, _format
        # No connection until one is assigned to this tensor
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

146

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

147

148

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler node from the primary operation of Pass `ps`.

        `nng` is accepted for interface compatibility; it is not used here.
        """
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm.resampling_mode
        # True when the op has two inputs and either of them has an empty shape
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input

        Raises AssertionError if this operation has no IFM2.
        """
        if not self.ifm2:
            # Raised explicitly (not via `assert`) so the check also runs under `python -O`
            raise AssertionError(f"Trying to set an IFM2 Connection to {self} which has no IFM2")
        conn.consumers.append(self)
        self.ifm2.connection = conn

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 when there is no IFM2"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        Side effect: the parent Pass's block_config is overwritten with the chosen block config.
        `nng` is accepted for interface compatibility; it is not used here.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            scheduler_op_info.npu_weights_tensor = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, to_upscale(self.resampling_mode))

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the input requirement for the smallest possible stripe (h,w = 1,1).

        The result is unpacked by __init__ as (min_stripe_input_w, min_stripe_input_h).
        """
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        """Returns a block config and SHRAM layout"""
        # Two LUT banks are requested when the parent op has an activation LUT
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Edge of the Scheduler Graph

    Scheduler internal representation of a Tensor that connects two SchedulerOperations
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperations writing to / reading from this connection
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        tensor_name = self.parent_tens.name
        return f"<Connection {tensor_name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

313

class Schedule:
    """A solution for how to schedule an NPU subgraph, together with its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Per-operation scheduling info, keyed by SchedulerOperation
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        # Cascade descriptions, keyed by an integer
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        """Subgraph name and schedule label joined by an underscore"""
        return "{}_{}".format(self.sg.name, self.label)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

327

328

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

329

class Scheduler:

330

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

331

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

332

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
    """Store the graph, subgraph, architecture and options; start with no scheduler operations."""
    self.nng = nng
    self.sg = sg
    self.arch = arch
    # Generic types are subscripted, not called: List[...] rather than List(...)
    self.sched_ops: List[SchedulerOperation] = []
    self.max_schedule = None
    self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

339

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

340

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Build the Scheduler Graph (SchedulerOperations joined by Connections) from the subgraph passes.

    Also records in self.min_memory_req the largest memory requirement of any single
    operation, counting only the FeatureMaps that the operation needs in full.
    """
    # Maps each Tensor seen so far to the Connection created for it
    tensor_connections: Dict[Tensor, Connection] = {}
    largest_full_fm_req = 0

    for ps in self.sg.passes:
        primary_op = ps.primary_op
        if not primary_op:
            # Passes without a primary op get no SchedulerOperation
            continue

        # Switch output FeatureMaps to NHCWB16 unless they need linear format
        for out_tens in ps.outputs:
            if out_tens.purpose == TensorPurpose.FeatureMap and not out_tens.needs_linear_format:
                out_tens.set_format(TensorFormat.NHCWB16, arch)

        sched_op = SchedulerOperation(ps, arch, self.nng)
        sched_op.index = len(self.sched_ops)

        ifm_tens = ps.ifm_tensor
        ifm2_tens = ps.ifm2_tensor
        ofm_tens = ps.ofm_tensor

        # Create any missing connections, then attach the operation to them
        if ifm_tens not in tensor_connections:
            tensor_connections[ifm_tens] = Connection(ifm_tens)
        if ifm2_tens and ifm2_tens not in tensor_connections:
            tensor_connections[ifm2_tens] = Connection(ifm2_tens)
        if ofm_tens not in tensor_connections:
            tensor_connections[ofm_tens] = Connection(ofm_tens)

        sched_op.add_ifm_connection(tensor_connections[ifm_tens])
        if ifm2_tens:
            sched_op.add_ifm2_connection(tensor_connections[ifm2_tens])
        sched_op.add_ofm_connection(tensor_connections[ofm_tens])

        self.sched_ops.append(sched_op)

        # Work out which FeatureMaps have to be present in full. Every condition is
        # evaluated up front so that none of them is skipped by short-circuiting.
        ifm_is_sg_input = ifm_tens in self.sg.input_tensors
        ifm2_is_sg_input = ifm2_tens and ifm2_tens in self.sg.input_tensors
        ofm_is_sg_output = ofm_tens in self.sg.output_tensors
        ifm_is_linear = ifm_tens.needs_linear_format
        ifm2_is_linear = ifm2_tens and ifm2_tens.needs_linear_format
        ofm_is_linear = ofm_tens.needs_linear_format or primary_op.memory_function == Op.ConcatSliceWrite
        # Op has multiple outputs or consumers
        ofm_is_shared = len(primary_op.outputs) > 1 or len(primary_op.outputs[0].consumer_list) > 1

        if ifm_is_sg_input or ifm_is_linear:
            sched_op.requires_full_ifm = True
        if ifm2_is_sg_input or ifm2_is_linear:
            sched_op.requires_full_ifm2 = True
        if ofm_is_sg_output or ofm_is_linear or ofm_is_shared:
            sched_op.requires_full_ofm = True

        # Add up the sizes of the FeatureMaps this operation requires in full
        full_fm_sizes = (
            (sched_op.requires_full_ifm, sched_op.ifm_size_in_bytes),
            (sched_op.requires_full_ifm2, sched_op.ifm2_size_in_bytes),
            (sched_op.requires_full_ofm, sched_op.ofm_size_in_bytes),
        )
        op_memory_req = sum(size_in_bytes() for required, size_in_bytes in full_fm_sizes if required)

        largest_full_fm_req = max(op_memory_req, largest_full_fm_req)

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = largest_full_fm_req

406

407

def create_initial_schedule(self) -> Schedule:
    """Build the "MAX" schedule: every operation produces its full OFM, no cascading or buffering"""
    max_schedule = Schedule(self.sg, "MAX")

    for sched_op in self.sched_ops:
        full_ofm_shape = sched_op.ofm.shape
        op_info = sched_op.create_scheduler_info(self.nng, full_ofm_shape)
        op_info.cycles = self.estimate_op_performance(sched_op, op_info.block_config, full_ofm_shape.depth)
        max_schedule.cost_map[sched_op] = op_info

    return max_schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Store the fast storage usage over time, and its peak, on `schedule`"""
    fast_storage = self.arch.fast_storage_mem_area
    scratch_mem_types = {MemType.Scratch, MemType.Scratch_fast}

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(), fast_storage, scratch_mem_types, False, lr_graph, Tensor.AllocationQuantum,
    )

    # Time-array of the memory used by the live ranges
    usage_over_time = lr_graph.get_temporal_memory_usage(fast_storage)
    schedule.memory_snapshot = usage_over_time

    # Peak usage is 0 when the time-array is empty
    schedule.fast_storage_peak_usage = max(usage_over_time, default=0)

434

435

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Return the cycle cost measured for `op` with `block_config` when producing `ofm_depth` OFM depth"""
    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
    ifm, ifm2, ofm = op.ifm, op.ifm2, op.ofm

    query.ifm_shape = ifm.shape
    query.ifm_memory_area = ifm.mem_area
    query.ifm_bits = ifm.dtype.size_in_bits()
    query.ifm_format = ifm.format

    # Without a second input, every IFM2 field receives the (falsy) ifm2 value itself
    query.ifm2_shape = ifm2.shape if ifm2 else ifm2
    query.ifm2_memory_area = ifm2.mem_area if ifm2 else ifm2
    query.ifm2_bits = ifm2.dtype.size_in_bits() if ifm2 else ifm2
    query.ifm2_format = ifm2.format if ifm2 else ifm2

    # The OFM keeps its shape except for the depth, which is the requested one
    query.ofm_shape = ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = ofm.mem_area
    query.ofm_bits = ofm.dtype.size_in_bits()
    query.ofm_format = ofm.format

    if op.parent_op.bias:
        query.const_shape = Shape4D(1, 1, 1, ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    activation = op.activation
    activation_type = activation.op_type if activation else activation
    return npu_performance.measure_cycle_cost(self.arch, op.op_type, activation_type, query)

457

458

def propose_schedule_buffering(self, ref_schedule: Schedule):
    """Create a buffered schedule from `ref_schedule`; its label is the reference label plus "_BUFFERED" """
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
    staging_limit_bytes = self.scheduler_options.optimization_sram_limit

    # Operations missing from the reference cost map are not part of this sub-schedule
    ops_in_schedule = (op for op in self.sched_ops if op in ref_schedule.cost_map)

    prev_op = None
    for sched_op in ops_in_schedule:
        self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
        prev_op = sched_op

    return buffered_schedule

473

474

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for `sched_op` to `buffered_schedule`, with weight buffering when it has weights.

    Returns the new cost entry, or None when `sched_op` already has one in `buffered_schedule`.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return None

    # Start from a shallow copy of the reference costing
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles

    # Memory in use at this op's time index; 0 when the index is beyond the snapshot
    memory_snapshot = ref_schedule.memory_snapshot
    if ref_cost.time_index < len(memory_snapshot):
        ref_memory_usage = memory_snapshot[ref_cost.time_index]
    else:
        ref_memory_usage = 0
    cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    parent_op = sched_op.parent_op
    if parent_op.weights:
        self.propose_weight_buffering(
            parent_op.weights,
            parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            cost.slack_buffering_memory,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True when `weight_tensor` is worth moving, otherwise False.

    That is the case when the tensor is not of a Scratch memory type, it lives in Dram or
    OffChipFlash, and permanent storage is a different memory area than fast storage.
    """
    if not weight_tensor:
        return False

    if weight_tensor.mem_type in (MemType.Scratch, MemType.Scratch_fast):
        return False

    # Weights are in permanent storage.
    # Only when permanent storage differs from feature map storage, there is a point moving the data
    stored_off_chip = weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
    return bool(stored_off_chip and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area)

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Choose the weight encoding and, where possible, a fast-storage weight buffer for `sched_op`.

    Updates the cost entry of `sched_op` in `buffered_schedule` in place: its OFM depth
    slices, encoded weights tensor, optional buffered weight tensor and slack figures.
    For cascaded operations the memory snapshot of `ref_schedule` is also increased by
    the size of the full weights.

    `prev_op` may have no entry in `buffered_schedule`; the available slack is then 0.
    `buffer_limit_bytes` is the memory available for buffering the weights.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        return

    encoded_weights = full_weights

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with limited
        # double buffer buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)
            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            prebuffer_depth = int(ref_cost.stripe.depth * prebuffer_ratio)

            # Round prebuffering down to nearest valid split depth
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            while True:
                buffering_depth = max(cost.block_config.ofm_block.depth, prebuffer_depth)

                # Clamp buffering to the double buffering limit
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
                    # Round the clamped depth itself. Rounding prebuffer_depth here would
                    # throw away the clamp computed on the line above.
                    buffering_depth = int(max(16, round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )

                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.max_range_bytes <= half_buffer_limit
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                # Halve the prebuffer depth and retry
                prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = Tensor(
            [1, 1, 1, weight_buffer_size], DataType.uint8, weight_tensor.name + "_buffer"
        )
        cost.buffered_weight_tensor.src_tensor = encoded_weights
        cost.buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        cost.buffered_weight_tensor.mem_type = MemType.Scratch_fast
        cost.buffered_weight_tensor.purpose = TensorPurpose.Weights
        cost.buffered_weight_tensor.sub_purpose = weight_tensor_purpose
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights

    cost.npu_weights_tensor = encoded_weights

670

671

def propose_minimal_schedule(self) -> Schedule:
    """Build the "MIN" schedule, where each operator produces the smallest stripe that satisfies the
    stride of the operator visited just before it (1 row for the first operator visited)"""
    min_schedule = Schedule(self.sg, "MIN")
    cost_map = min_schedule.cost_map

    # Operators are visited in reverse order; `consumer` is the previously visited one
    consumer = None
    for sched_op in reversed(self.sched_ops):
        if consumer:
            stripe_height = consumer.kernel.stride.y
        else:
            stripe_height = 1
        smallest_stripe = sched_op.ofm.shape.with_height(stripe_height)

        op_info = sched_op.create_scheduler_info(self.nng, smallest_stripe)
        op_info.cycles = self.estimate_op_performance(sched_op, op_info.block_config, sched_op.ofm.shape.depth)
        cost_map[sched_op] = op_info

        consumer = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Create a schedule named `label` in which the last operator uses `final_stripe`.

    The operators are walked in reverse order. Operators without an entry in the reference schedule's cost_map are
    ignored. Every other operator gets a fresh cost entry for the current stripe and reuses the buffered weight
    tensor from the reference schedule. The stripe for the next operator visited is then derived from this
    operator's IFM shape and vertical kernel stride.

    Returns:
        The new Schedule; the reference schedule's cost entries are only read.
    """
    reference_costs = ref_schedule.cost_map
    result = Schedule(self.sg, label)

    current_stripe = final_stripe
    for op in reversed(self.sched_ops):
        if op in reference_costs:
            # Cost entry for this operator using the stripe requested by its successor
            op_cost = op.create_scheduler_info(self.nng, current_stripe)

            # Weight buffering is inherited unchanged from the reference schedule
            op_cost.buffered_weight_tensor = reference_costs[op].buffered_weight_tensor

            op_cost.cycles = self.estimate_op_performance(op, op_cost.block_config, op.ofm.shape.depth)
            result.cost_map[op] = op_cost

            # Stripe required from the preceding operator: IFM shape with the height scaled by the stride
            scaled_height = current_stripe.height * op.kernel.stride.y
            current_stripe = op.ifm.shape.with_height(scaled_height)
        # else: the operator is not part of the sub-schedule - nothing to do

    return result

716

717

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Return the peak estimated memory usage over the operators covered by `schedule`.

    Args:
        schedule: Schedule whose cost_map and cascades are inspected.
        non_local_mem_usage: Extra memory per operator that is live while the operator runs; operators missing
            from the mapping contribute 0.

    Returns:
        The largest per-operator (or per-cascade) usage found, or 0 when no operator is covered.
    """
    op_costs = schedule.cost_map
    cascade_table = schedule.cascades

    peak = 0
    for op in self.sched_ops:
        if op not in op_costs:
            # Operator is outside the (sub-)schedule
            continue

        info = op_costs[op]
        if info.cascade:
            # Cascaded operator: the cascade's figure already accounts for non-local memory
            usage = cascade_table[info.cascade].mem_usage
        else:
            # Stand-alone operator: IFM + OFM + optional weight buffer + non-local memory
            weight_bytes = 0
            if info.buffered_weight_tensor:
                weight_bytes = info.buffered_weight_tensor.storage_size()

            usage = (
                op.ifm_size_in_bytes()
                + op.ofm_size_in_bytes()
                + weight_bytes
                + non_local_mem_usage.get(op, 0)
            )

        peak = max(usage, peak)

    return peak

747

748

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Optimize the operators covered by `cascade_info` as a stand-alone sub-schedule.

    A sub-schedule holding only the covered operators is built from the reference schedule, weight buffering is
    proposed for it, and then larger stripes for the final operator are tried in a bisection-like search. The
    largest proposal whose estimated memory usage stays within `memory_limit` wins.

    Args:
        cascade_info: Cascade whose start/end indices select the operators from self.sched_ops.
        ref_schedule: Schedule supplying the initial costs and the memory snapshot.
        max_template: Passed through to the cascade builder.
        memory_limit: Upper bound for the estimated memory usage of a proposal.

    Returns:
        The best schedule found; the buffered sub-schedule if no striping proposal was accepted.
    """
    ref_costs = ref_schedule.cost_map

    # Operators that belong to this sub-schedule
    first_idx, last_idx = cascade_info.start, cascade_info.end
    ops = self.sched_ops[first_idx : last_idx + 1]

    # Sub-schedule that carries only the costs of those operators
    sub_schedule = Schedule(self.sg, f"SUB_{first_idx}_{last_idx}")
    sub_schedule.cost_map.update({op: ref_costs[op] for op in ops})
    sub_schedule.cascades[last_idx] = cascade_info
    # The memory snapshot is shared with the reference schedule
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Memory that is live while the sub-schedule runs without belonging to it
    head_op = ops[0]
    snapshot_at_start = ref_schedule.memory_snapshot[ref_costs[head_op].time_index]
    parallel_usage = snapshot_at_start - cascade_info.mem_usage

    # An initial IFM with further consumers must stay alive for the whole sub-schedule, cascaded or not
    if len(head_op.ifm.connection.consumers) > 1:
        persistent_initial_ifm = head_op.ifm_size_in_bytes()
    else:
        persistent_initial_ifm = 0

    # Per-operator non-local usage; the persistent IFM is only added for operators after the first
    non_local_mem_usage = {
        op: parallel_usage if position == 0 else parallel_usage + persistent_initial_ifm
        for position, op in enumerate(ops)
    }

    cascade_builder = CascadeBuilder(ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Begin with weight buffering, keeping the cascades of the unbuffered sub-schedule
    buffered_sub_schedule = self.propose_schedule_buffering(sub_schedule)
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Candidate stripes for the final operator: heights 1 .. height // 2
    final_shape = ops[-1].ofm.shape
    candidates = [final_shape.with_height(height) for height in range(1, final_shape.height // 2 + 1)]

    # Bisection-like search over the candidates
    best_schedule = buffered_sub_schedule
    iteration = 0
    while len(candidates) > 1:
        middle = len(candidates) // 2
        proposal = self.propose_schedule_striping(candidates[middle], f"OPTIMIZED_{iteration}", buffered_sub_schedule)

        cascade_builder.build_cascades(proposal, max_template, memory_limit)

        proposal_usage = self.estimate_schedule_memory_usage(proposal, non_local_mem_usage)
        fits = proposal_usage <= memory_limit
        if fits:
            # Accepted: drop every candidate smaller than the proposed one
            candidates = candidates[middle:]
            best_schedule = proposal
            if not proposal.cascades:
                # Nothing needs cascading any more - stop searching
                break
        else:
            # Rejected: drop the proposed candidate and everything larger
            candidates = candidates[:middle]

        iteration += 1

    return best_schedule

def optimize_schedule(
    self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
) -> Schedule:
    """Optimize every cascade of `schedule` as a sub-schedule and merge the results back.

    When spilling is disabled and the peak fast-storage usage of `max_sched` is below the SRAM limit taken from
    `options`, `max_sched` is returned unchanged. Otherwise each cascade is removed from `schedule`, optimized, and
    its costs and cascades are written back into `schedule`, which is therefore modified in place. Finally the
    memory snapshot is refreshed and weight buffering is proposed for the whole schedule.

    Returns:
        `max_sched`, or the buffered version of the optimized `schedule`.
    """
    sram_limit = options.optimization_sram_limit

    if max_sched.fast_storage_peak_usage < sram_limit:
        if not self.arch.is_spilling_enabled():
            # The maximum performance schedule already fits the SRAM target
            return max_sched

    # Snapshot the cascades first: the dictionary is modified while they are processed
    pending_cascades = list(schedule.cascades.values())
    for cascade_info in pending_cascades:
        # Take the cascade out of the schedule before re-optimizing its operators
        del schedule.cascades[cascade_info.end]
        sub_result = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        # Fold the optimized operator costs and cascades back into the full schedule
        schedule.cost_map.update(sub_result.cost_map)
        schedule.cascades.update(sub_result.cascades)

    # Refresh the memory snapshot for the merged schedule
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Buffer the optimized schedule and carry the cascade metadata over from the unbuffered one
    buffered = self.propose_schedule_buffering(schedule)
    buffered.cascades = schedule.cascades
    return buffered

851

852

def apply_schedule(self, sched: Schedule):
    """Write the decisions of `sched` back onto the operators and tensors as the final solution.

    For every operator this sets the block configuration of its parent pass. If the operator has a buffer in its
    cascade, the IFM's parent tensor is placed in fast storage and turned into a rolling buffer. The source tensor
    of a buffered weight tensor is also re-pointed at the operator's NPU weights tensor.
    """
    for op in self.sched_ops:
        info = sched.cost_map[op]
        cascade = sched.cascades.get(info.cascade)

        if cascade and op in cascade.buffers:
            ifm_tensor = op.ifm.connection.parent_tens
            # Memory area and type
            ifm_tensor.mem_area = self.arch.fast_storage_mem_area
            ifm_tensor.mem_type = MemType.Scratch_fast
            # Rolling buffer with the height recorded in the cascade
            ifm_tensor.set_format(TensorFormat.NHCWB16, self.arch)
            ifm_tensor.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade.buffers[op].height)

        op.parent_ps.block_config = info.block_config.old_style_representation()

        # Make sure the buffered weights refer to the weights that will actually be used
        if info.buffered_weight_tensor:
            info.buffered_weight_tensor.src_tensor = info.npu_weights_tensor

871

872

def use_fast_storage_for_feature_maps(self, schedule: Schedule, memory_limit: int):
    """Place feature maps in fast storage and evict those that do not fit within `memory_limit`.

    Does nothing when fast storage and feature map storage are the same memory area. Otherwise the OFM of every
    non-cascaded operator that has dependants (and whose OFM tensor has no `None` consumer) is first moved to fast
    storage. Live ranges are then extracted, and each fast-storage live range whose peak usage exceeds
    `memory_limit` has its unbuffered feature maps moved back to feature map storage.

    Args:
        schedule: Schedule supplying the per-operator cascade information.
        memory_limit: Maximum allowed fast-storage usage at any point in time.
    """
    fast_area = self.arch.fast_storage_mem_area
    feature_map_area = self.arch.feature_map_storage_mem_area
    if fast_area == feature_map_area:
        return

    # Optimistically move the eligible OFMs to fast storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]
        if cost.cascade == 0 and sched_op.get_dependants():
            ofm_tens = sched_op.ofm.connection.parent_tens
            if not any(cons is None for cons in ofm_tens.consumer_list):
                ofm_tens.mem_area = fast_area
                ofm_tens.mem_type = MemType.Scratch_fast

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        fast_area,
        {MemType.Scratch, MemType.Scratch_fast},
        False,
        lr_graph,
        Tensor.AllocationQuantum,
    )

    # Iterate over live ranges and evict tensors that don't fit
    # NOTE(review): the slice arithmetic below presumably relies on the snapshot being a NumPy array - confirm
    fast_storage_snapshot = lr_graph.get_temporal_memory_usage(fast_area)
    for lr in lr_graph.lrs:
        if lr.mem_area != fast_area:
            continue

        live_span = slice(lr.start_time, lr.end_time + 1)
        if not max(fast_storage_snapshot[live_span]) > memory_limit:
            continue

        # Evict to feature map storage; only unbuffered FeatureMaps can be evicted
        evicted = False
        for tens in lr.tensors:
            if tens.purpose == TensorPurpose.FeatureMap and tens.sub_purpose == TensorSubPurpose.Standard:
                tens.mem_area = feature_map_area
                tens.mem_type = MemType.Scratch
                evicted = True

        if evicted:
            # Adjust the snapshot exactly once per live range. lr.size is the size of the whole live range, so
            # subtracting it for every evicted tensor would under-report the usage seen by later live ranges
            # whenever a live range holds more than one evictable tensor.
            fast_storage_snapshot[live_span] -= lr.size

909

910

def move_constant_data(self):
    """Decide which input tensors are moved out of permanent storage and rewire the operators to the copies.

    A move will generate a DMA command in the high-level command stream. Look-up tables are always cloned and
    placed in SHRAM. Feature maps are only cloned for binary elementwise operators, when the tensor exceeds the
    SHRAM available for an IFM and not all of its consumers are vector products.
    """
    scratch_types = (MemType.Scratch, MemType.Scratch_fast)

    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        free_banks = self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = free_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type in scratch_types:
                # Not in permanent storage
                continue

            # Moving data only makes sense when permanent storage differs from feature map storage
            in_separate_permanent_storage = (
                tens.mem_area in self.arch.permanent_storage_mem_area
                and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
            )
            is_lut = tens.purpose == TensorPurpose.LUT
            if not (in_separate_permanent_storage or is_lut):
                continue

            if not is_lut:
                is_movable_feature_map = (
                    tens.purpose == TensorPurpose.FeatureMap
                    and sched_op.op_type.is_binary_elementwise_op()
                    and tens.shape != []
                    and sched_op.ifm.shape != sched_op.ofm.shape
                    and tens.storage_size() > max_ifm_shram_avail
                )
                if not is_movable_feature_map:
                    continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            if only_vector_product_consumers and not is_lut:
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if is_lut:
                new_tens.mem_area = MemArea.Shram

            # Register the consumer on the clone and swap it in on both the operator and its pass
            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            sched_op.parent_ps.inputs[idx] = new_tens

950

951

def print_schedule(self, schedule: Schedule):
    """Print a human-readable dump of `schedule` to stdout.

    Every operator with a cost entry is listed with its OFM shape, type, kernel, cost information and the SRAM
    usage from the memory snapshot (0 when its time index lies outside the snapshot). The cascades follow.
    """
    print(f"Schedule: '{schedule.name}'")

    snapshot = schedule.memory_snapshot
    for op in self.sched_ops:
        if op in schedule.cost_map:
            info = schedule.cost_map[op]
            print(f"\t{op.index}: Operation {op.name} - OFM {op.ofm.shape}")
            print(f"\t\tType: {op.op_type}")
            print(f"\t\tKernel: {op.kernel}")
            print(f"{info}")

            if info.time_index < len(snapshot):
                mem_usage = snapshot[info.time_index]
            else:
                mem_usage = 0
            print(f"\t\tSRAM Used: {mem_usage} bytes")
        # else: operator is not covered (sub-schedule printing)

    print("\tCascades:")
    for number, cascade in enumerate(schedule.cascades.values()):
        print(f"\t\t{number}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

973

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

974

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

975

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph for the current schedule.

    With spilling enabled, fast scratch memory and regular scratch memory are allocated separately, fast storage
    first. Without spilling, both scratch memory types are allocated together in the feature map storage area.
    This function itself returns nothing.
    """
    root_sg = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important
        allocations = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        allocations = [
            (arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast}),
        ]

    for mem_area, mem_type_set in allocations:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1008

"""Entry point for the Scheduler"""

1009

# Initialize CPU subgraphs

1010

schedulers = dict()

1011

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1012

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1013

if sg.placement != PassPlacement.Npu:

1014

# Create cascaded passes for CPU Ops

1015

cascaded_passes = []

1016

for idx, ps in enumerate(sg.passes):

1017

cps = CascadedPass(

1018

ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,

1019

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1020

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1021

cps.time = idx

1022

ps.cascade = cps

1023

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1024

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1025

sg.cascaded_passes = cascaded_passes

1026

else:

1027

# Npu subgraph - create schedule

1028

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1029

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1030

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1031

scheduler.create_scheduler_representation(arch)

1032

sg.sched_ops = scheduler.sched_ops

1033

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1034

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1035

# Create the Max schedule template

1036

max_schedule_template = scheduler.create_initial_schedule()

1037

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1038

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1039

# Create the optimised Max schedule

1040

sg.schedule = max_schedule_template

1041

scheduler.update_op_memory_snapshot(max_schedule_template)

1042

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template)

1043

sg.schedule = opt_max_schedule

1044

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1045

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1046

# Create Min schedule

1047

min_schedule = scheduler.propose_minimal_schedule()

1048

initial_sram_limit = scheduler_options.optimization_sram_limit

1049

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1050

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1051

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1052

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1053

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1054

sg.schedule = min_schedule

1055

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1056

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1057

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1058

# Create an optimized schedule

1059

sg.schedule = scheduler.optimize_schedule(

1060

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1061

)

1062

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1063

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1064

scheduler.apply_schedule(sg.schedule)

1065

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1066

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame^]

1067

if scheduler_options.verbose_schedule:

1068

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1069

Tim Hall