Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

20

import copy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

21

from enum import auto

22

from enum import IntEnum

23

from typing import Dict

24

from typing import List

25

from typing import Optional

26

from typing import Tuple

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

28

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

29

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

30

from . import tensor_allocation

31

from . import weight_compressor

32

from .architecture_allocator import ArchitectureBlockConfig

33

from .architecture_allocator import find_block_config

34

from .architecture_allocator import get_ifm_area_required

35

from .architecture_allocator import to_upscale

36

from .architecture_features import ArchitectureFeatures

37

from .architecture_features import Block

38

from .cascade_builder import CascadeBuilder

39

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

40

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

41

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

42

from .nn_graph import Graph

43

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

44

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

45

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

46

from .nn_graph import Subgraph

47

from .numeric_util import round_down

48

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

49

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

50

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

51

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

52

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

53

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

54

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

55

from .tensor import TensorFormat

56

from .tensor import TensorPurpose

57

from .tensor import TensorSubPurpose

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

58

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

59

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

60

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the shape that a tensor of the given format is stored with.

    For TensorFormat.NHCWB16 the depth is passed through round_up(depth, 16);
    every other format returns ``shape`` unchanged.
    """
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape

class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        # Print the bare member name (e.g. "Size") rather than the default enum rendering
        return self.name

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

77

class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        """Store the block configuration and stripe shapes; all remaining fields start at their defaults."""
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # By default a single slice spanning the whole depth of the OFM stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.npu_scales_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo built from this one's constructor arguments.

        Only the constructor arguments and ``cascade`` are carried over; every
        other attribute (tensors, cycles, slack, depth slices) is reset to the
        constructor defaults on the returned object.
        """
        res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe)
        res.cascade = self.cascade
        return res

    def __str__(self):
        """Return a multi-line, tab-indented summary of this cost entry."""
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        # The "x and f(x)" form prints None when the tensor has not been assigned yet
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        res += (
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n"
        )
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res

class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        # Note: the constructor argument is called sram_target but is stored as optimization_sram_limit
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        """Return the class name followed by the instance attribute dictionary."""
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

140

class SchedulerTensor:
    """Lightweight record of one tensor (shape, data type, memory area, format) as seen by the Scheduler."""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # Filled in later through SchedulerOperation.add_*_connection()
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

147

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

148

149

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler node from the primary operation of the Pass ``ps``.

        ``nng`` is accepted for interface compatibility; it is not used in this constructor.
        """
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm.resampling_mode
        # A binary operation where either input has an empty shape is treated as using a scalar
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 when the operation has no IFM2"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        Side effect: the parent Pass's ``block_config`` is overwritten with the
        old-style representation of the block config chosen here.
        ``nng`` is not used by this method.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, to_upscale(self.resampling_mode))

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the IFM requirement of the smallest possible OFM stripe.

        The result is whatever _get_stripe_input_requirement() returns and is
        unpacked by __init__ as (width, height).
        """
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        """Returns a block config and SHRAM layout for the given IFM/IFM2/OFM shapes"""
        # Two LUT banks are requested whenever the parent operation has an activation LUT
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        """Return a short tag containing the name of the wrapped tensor."""
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

317

class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Per-operation cost entries of this schedule
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        # Cascades keyed by integer id
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        """Schedule name in the form "<subgraph name>_<label>"."""
        return f"{self.sg.name}_{self.label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

331

332

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

333

class Scheduler:

334

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

335

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

336

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
    """Store the graph, subgraph, architecture and options; no scheduling work is done here."""
    self.nng = nng
    self.sg = sg
    self.arch = arch
    # Generic types are subscripted, not called: List[...] rather than List(...)
    self.sched_ops: List[SchedulerOperation] = []
    self.max_schedule = None
    self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

343

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

344

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Creates a Scheduler Graph representation

    For every pass of the subgraph that has a primary op, a SchedulerOperation
    is created, appended to ``self.sched_ops`` and linked to its IFM/IFM2/OFM
    Connection objects. The largest per-operation sum of feature maps that
    must be kept whole is stored in ``self.min_memory_req``.

    Side effect: eligible output tensors of each pass are switched to the
    NHCWB16 format.
    """
    # Temporary dict for creating connections between the Operations
    connections: Dict[Tensor, Connection] = {}
    # Memory required for the largest FeatureMap that has to be full
    min_memory_req = 0
    for ps in self.sg.passes:
        if ps.primary_op:
            # Set tensor format to NHCWB16 for output FeatureMaps, if possible
            for output in ps.outputs:
                if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                    continue
                if not output.needs_linear_format:
                    output.set_format(TensorFormat.NHCWB16, arch)

            # Create SchedulerOperations
            op = SchedulerOperation(ps, arch, self.nng)
            op.index = len(self.sched_ops)

            # Make connections
            if ps.ifm_tensor not in connections:
                connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
            if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
            if ps.ofm_tensor not in connections:
                connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

            op.add_ifm_connection(connections[ps.ifm_tensor])
            if ps.ifm2_tensor:
                op.add_ifm2_connection(connections[ps.ifm2_tensor])
            op.add_ofm_connection(connections[ps.ofm_tensor])

            # Set requirements on the ifm/ofm buffers
            self.sched_ops.append(op)
            if ps.ifm_tensor in self.sg.input_tensors:
                # This Op consumes a subgraph input
                op.requires_full_ifm = True
            if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                # This Op consumes a subgraph input
                op.requires_full_ifm2 = True
            if ps.ofm_tensor in self.sg.output_tensors:
                # This Op produces a subgraph output
                op.requires_full_ofm = True
            if ps.ifm_tensor.needs_linear_format:
                op.requires_full_ifm = True
            if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                op.requires_full_ifm2 = True
            if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                op.requires_full_ofm = True
            if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                # Op has multiple outputs or consumers - requires full OFM
                op.requires_full_ofm = True

            # Check memory requirements if this Op requires any full FeatureMaps
            op_memory_req = 0
            if op.requires_full_ifm:
                op_memory_req += op.ifm_size_in_bytes()
            if op.requires_full_ifm2:
                op_memory_req += op.ifm2_size_in_bytes()
            if op.requires_full_ofm:
                op_memory_req += op.ofm_size_in_bytes()

            min_memory_req = max(op_memory_req, min_memory_req)

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = min_memory_req

410

411

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind"""
    schedule = Schedule(self.sg, "MAX")

    for op in self.sched_ops:
        # Every op produces its complete OFM in one go: the stripe is the full OFM shape
        cost = op.create_scheduler_info(self.nng, op.ofm.shape)
        cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
        schedule.cost_map[op] = cost

    return schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Record the fast-storage usage over time, and its peak, on ``schedule``.

    Sets ``schedule.memory_snapshot`` to the temporal memory usage reported by
    the live-range graph and ``schedule.fast_storage_peak_usage`` to its
    maximum (0 when the usage sequence is empty).
    """
    memories_list = [(self.arch.fast_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
        )

    # Populate time-array with memory used by live ranges
    temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = temporal_usage

    # Set the peak memory usage
    schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

438

439

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Return the cycle cost reported by npu_performance for ``op``.

    The query uses the operation's full IFM/IFM2 shapes and its OFM shape with
    the depth replaced by ``ofm_depth``, evaluated for ``block_config``.
    """
    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm_format = op.ifm.format
    # The "x and x.attr" form leaves these fields as None when there is no IFM2
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    if op.parent_op.bias:
        # Note: the constant shape uses the full OFM depth, not ofm_depth
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

461

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame^]

462

def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
    """Create a buffered schedule

    Walks ``self.sched_ops`` in order and proposes buffering for every
    operation that has an entry in ``ref_schedule.cost_map``. The new schedule
    is labelled "<ref label>_BUFFERED".
    """
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

    prev_op = None
    for sched_op in self.sched_ops:
        if sched_op not in ref_schedule.cost_map:
            # sched_op is not part of this sub-schedule - skip
            continue

        self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
        # Only ops that are part of the sub-schedule become the "previous" op
        prev_op = sched_op

    return buffered_schedule

476

477

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for ``sched_op`` to ``buffered_schedule``, with weight buffering where applicable.

    The entry starts as a shallow copy of the reference schedule's cost.
    Returns the new cost entry, or None if ``sched_op`` already has an entry
    in ``buffered_schedule``.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return

    # Take the reference schedule as default costings for this schedule
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
    memory_snapshot = ref_schedule.memory_snapshot
    # A time index beyond the end of the snapshot counts as zero usage
    ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
    cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    if sched_op.parent_op.weights:
        self.propose_weight_buffering(
            sched_op.parent_op.weights,
            sched_op.parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            cost.slack_buffering_memory,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True if ``weight_tensor`` should be moved to fast storage, otherwise False.

    False is returned when ``weight_tensor`` is falsy (e.g. None) or when its
    memory type is already Scratch or Scratch_fast.
    """
    if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
        # Weights are in permanent storage
        # Only when permanent storage differs from feature map storage, there is a point moving the data
        if (
            weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
            and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
        ):
            return True

    return False

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Choose the weight encoding, OFM depth slicing and buffer tensor for ``sched_op``.

    The cost entry of ``sched_op`` in ``buffered_schedule`` is updated in place
    (``ofm_depth_slices``, ``npu_weights_tensor``, ``npu_scales_tensor`` and,
    when a buffer fits within ``buffer_limit_bytes``, ``buffered_weight_tensor``
    and the slack fields). For cascaded operations the memory snapshot of
    ``ref_schedule`` is also increased by the size of the full weights.
    Nothing is returned.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    encoded_weights = full_weights
    encoded_scales = full_scales

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with limited
        # double buffer buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

        # Recompute the ratio from the (possibly clamped) byte count
        prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            block_depth = cost.block_config.ofm_block.depth

            # Choose initial prebuffering depth (already buffer clamped)
            prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            # Calculate cycles executed during the prebuffer
            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
            buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

            # Choose initial buffering depth and clamp to the double buffering limit
            buffering_depth = round_up(buffering_depth, block_depth)
            buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
            if buffering_bytes > half_buffer_limit:
                buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

            while True:
                # Attempt to buffer whole blocks
                # Compare depth with depth: comparing the byte count against block_depth mixed
                # units and could round a depth smaller than one block down to zero
                if buffering_depth > block_depth:
                    buffering_depth = round_down(buffering_depth, block_depth)
                else:
                    buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )

                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.max_range_bytes <= half_buffer_limit
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                # Halve whichever of the two depths is currently the larger and retry
                if buffering_depth > prebuffer_depth:
                    buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                else:
                    prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = Tensor(
            [1, 1, 1, weight_buffer_size], DataType.uint8, weight_tensor.name + "_buffer"
        )
        cost.buffered_weight_tensor.src_tensor = encoded_weights
        cost.buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        cost.buffered_weight_tensor.mem_type = MemType.Scratch_fast
        cost.buffered_weight_tensor.purpose = TensorPurpose.Weights
        cost.buffered_weight_tensor.sub_purpose = weight_tensor_purpose
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

691

692

def propose_minimal_schedule(self) -> Schedule:
    """Builds the "MIN" schedule, where every operator produces the shortest OFM stripe its successor accepts.

    The operators are visited from last to first. The last operator is given a stripe height of 1; every other
    operator is given a stripe height equal to the vertical kernel stride of the operator that follows it in
    self.sched_ops. A cost entry, with cycles taken from estimate_op_performance, is stored for each operator."""
    schedule = Schedule(self.sg, "MIN")
    costs = schedule.cost_map

    # Iterating in reverse means the Op handled in the previous iteration is the successor of the current one
    successor = None
    for op in reversed(self.sched_ops):
        if successor:
            stripe_height = successor.kernel.stride.y
        else:
            stripe_height = 1
        stripe = op.ofm.shape.with_height(stripe_height)

        op_cost = op.create_scheduler_info(self.nng, stripe)
        op_cost.cycles = self.estimate_op_performance(op, op_cost.block_config, op.ofm.shape.depth)
        costs[op] = op_cost

        successor = op

    return schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Creates a schedule called label whose stripes are derived backwards, starting from final_stripe.

    Only Ops that have an entry in the cost map of ref_schedule take part. The last of them uses final_stripe; each
    earlier one uses its own IFM shape with the height set to the following stripe's height multiplied by the
    following Op's vertical kernel stride. Weight buffering is taken over from ref_schedule unchanged."""
    reference_costs = ref_schedule.cost_map
    proposal = Schedule(self.sg, label)

    current_stripe = final_stripe
    # Ops without a reference cost are not part of the sub-schedule and are left out
    for op in (candidate for candidate in reversed(self.sched_ops) if candidate in reference_costs):
        # New cost entry for the current stripe, re-using the reference weight buffer
        op_cost = op.create_scheduler_info(self.nng, current_stripe)
        op_cost.buffered_weight_tensor = reference_costs[op].buffered_weight_tensor
        op_cost.cycles = self.estimate_op_performance(op, op_cost.block_config, op.ofm.shape.depth)
        proposal.cost_map[op] = op_cost

        # Stripe that the preceding Op has to produce to feed this one
        current_stripe = op.ifm.shape.with_height(current_stripe.height * op.kernel.stride.y)

    return proposal

737

738

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Returns the highest memory estimate found among the Ops of the given schedule.

    Ops without an entry in schedule.cost_map are ignored. An Op that belongs to a cascade contributes the
    mem_usage of that cascade. Any other Op contributes its IFM size + OFM size + the storage size of its buffered
    weight tensor (if it has one) + its entry in non_local_mem_usage (0 when absent). The result is 0 when no Op
    contributes."""
    op_costs = schedule.cost_map
    cascade_infos = schedule.cascades

    peak = 0
    for op in self.sched_ops:
        if op in op_costs:
            op_cost = op_costs[op]
            if op_cost.cascade:
                # The cascade's figure already accounts for the non-local memory usage
                usage = cascade_infos[op_cost.cascade].mem_usage
            else:
                # Stand-alone Op - add up its own buffers
                weight_buffer = op_cost.buffered_weight_tensor
                weight_buffer_bytes = weight_buffer.storage_size() if weight_buffer else 0
                usage = (
                    op.ifm_size_in_bytes()
                    + op.ofm_size_in_bytes()
                    + weight_buffer_bytes
                    + non_local_mem_usage.get(op, 0)
                )
            peak = max(usage, peak)

    return peak

768

769

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Optimizes the part of ref_schedule that is covered by the given cascade.

    A sub-schedule holding only the cascade's Ops is created and weight buffering is proposed for it. After that,
    stripe heights for the last Op of the sub-schedule are tried out in a binary-search-like fashion. The result is
    the last proposal whose estimated memory usage was within memory_limit, or the buffered sub-schedule when no
    proposal fitted (or none was made)."""
    ref_cost = ref_schedule.cost_map
    first_idx = cascade_info.start
    last_idx = cascade_info.end
    ops = self.sched_ops[first_idx : last_idx + 1]

    # The sub-schedule re-uses the cost entries, the cascade and the memory snapshot of the reference schedule
    sub_schedule = Schedule(self.sg, f"SUB_{first_idx}_{last_idx}")
    sub_schedule.cost_map.update({op: ref_cost[op] for op in ops})
    sub_schedule.cascades[last_idx] = cascade_info
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Memory that is live while the cascade runs without belonging to it
    head_op = ops[0]
    parallel_usage = ref_schedule.memory_snapshot[ref_cost[head_op].time_index] - cascade_info.mem_usage
    # An IFM of the first Op that has further consumers must stay alive for the whole sub-schedule, no matter
    # whether it ends up in a cascade
    if len(head_op.ifm.connection.consumers) > 1:
        persistent_ifm = head_op.ifm_size_in_bytes()
    else:
        persistent_ifm = 0
    # Every Op but the first is charged for that IFM on top of the parallel usage
    non_local_mem_usage = {
        op: (parallel_usage + persistent_ifm if idx else parallel_usage) for idx, op in enumerate(ops)
    }

    cascade_builder = CascadeBuilder(ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Buffering comes first; the cascades are carried over from the unbuffered sub-schedule
    buffered = self.propose_schedule_buffering(sub_schedule, self.scheduler_options.optimization_sram_limit)
    buffered.cascades = sub_schedule.cascades

    # Candidate stripes for the last Op: heights from 1 up to half of its OFM height
    final_shape = ops[-1].ofm.shape
    candidates = [final_shape.with_height(height) for height in range(1, final_shape.height // 2 + 1)]

    best = buffered
    iteration = 0
    while len(candidates) > 1:
        middle = len(candidates) // 2
        proposal = self.propose_schedule_striping(candidates[middle], f"OPTIMIZED_{iteration}", buffered)

        cascade_builder.build_cascades(proposal, max_template, memory_limit)

        if self.estimate_schedule_memory_usage(proposal, non_local_mem_usage) <= memory_limit:
            # The proposal fits - continue with this stripe and the larger ones
            candidates = candidates[middle:]
            best = proposal
            if not proposal.cascades:
                # Nothing needs to be cascaded any more, so stop searching
                break
        else:
            # The proposal is too big - continue with the smaller stripes only
            candidates = candidates[:middle]

        iteration += 1

    return best

def optimize_schedule(
    self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
) -> Schedule:
    """Optimizes each cascade of schedule as a sub-schedule and merges the results back into schedule.

    max_sched is returned untouched when its fast storage peak usage is below options.optimization_sram_limit and
    spilling is disabled. Otherwise schedule is updated in place, installed as self.sg.schedule, and the schedule
    returned by propose_schedule_buffering (with the cascades of schedule attached) is the result."""
    sram_limit = options.optimization_sram_limit
    fits_without_spilling = max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled()
    if fits_without_spilling:
        # The maximum performance schedule already meets the SRAM target
        return max_sched

    # Iterate over a copy because schedule.cascades is modified while the cascades are processed
    for cascade_info in list(schedule.cascades.values()):
        # Take the cascade out of the schedule before optimizing the Ops it covers
        del schedule.cascades[cascade_info.end]
        sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        # Merge the optimized Op costs and cascades back into the full schedule
        schedule.cost_map.update(sub_schedule.cost_map)
        schedule.cascades.update(sub_schedule.cascades)

    # Refresh the memory snapshot for the updated schedule
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Buffer the optimized schedule and keep the cascade metadata of the unbuffered one
    buffered = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
    buffered.cascades = schedule.cascades
    return buffered

874

875

def apply_schedule(self, sched: Schedule):
    """Writes the decisions of the given schedule to the tensors and passes of the scheduled Ops.

    For every Op that has a buffer registered in its cascade, the IFM's parent tensor is moved to fast storage and
    turned into a rolling buffer. The block config of each Op is copied to its parent pass, and the source tensor
    reference of any buffered weight tensor is pointed at the Op's NPU weights tensor."""
    for op in self.sched_ops:
        cost = sched.cost_map[op]
        cascade = sched.cascades.get(cost.cascade)
        if cascade and op in cascade.buffers:
            rolling_tens = op.ifm.connection.parent_tens
            # Memory area and type
            rolling_tens.mem_area = self.arch.fast_storage_mem_area
            rolling_tens.mem_type = MemType.Scratch_fast
            # Rolling buffer in Y, sized by the height of the cascade buffer
            rolling_tens.set_format(TensorFormat.NHCWB16, self.arch)
            rolling_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade.buffers[op].height)

        op.parent_ps.block_config = cost.block_config.old_style_representation()

        # Make sure the buffered weights refer to the weights tensor of this schedule
        weight_buffer = cost.buffered_weight_tensor
        if weight_buffer:
            weight_buffer.src_tensor = cost.npu_weights_tensor

894

895

def use_fast_storage_for_feature_maps(self, schedule: Schedule, memory_limit: int):
    """Moves OFMs of non-cascaded Ops into fast storage and evicts feature maps again where the limit is exceeded.

    Nothing is done when fast storage and feature map storage are the same memory area. Otherwise the OFM tensor of
    every Op that is outside a cascade, has dependants and has no None entry in its consumer list is placed in fast
    storage. Live ranges are then extracted, and for each fast-storage live range whose peak usage exceeds
    memory_limit the standard (unbuffered) feature maps are sent back to the feature map storage area."""
    if self.arch.fast_storage_mem_area == self.arch.feature_map_storage_mem_area:
        return

    # Start out with every eligible OFM in fast storage
    for op in self.sched_ops:
        if schedule.cost_map[op].cascade == 0 and op.get_dependants():
            ofm_tens = op.ofm.connection.parent_tens
            if all(consumer is not None for consumer in ofm_tens.consumer_list):
                ofm_tens.mem_area = self.arch.fast_storage_mem_area
                ofm_tens.mem_type = MemType.Scratch_fast

    # Gather the live ranges of everything that is placed in fast storage
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        self.arch.fast_storage_mem_area,
        {MemType.Scratch, MemType.Scratch_fast},
        False,
        lr_graph,
        Tensor.AllocationQuantum,
    )

    # Walk the live ranges and push out the tensors that do not fit
    usage_over_time = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    for lr in lr_graph.lrs:
        lifetime = slice(lr.start_time, lr.end_time + 1)
        exceeds_limit = (
            lr.mem_area == self.arch.fast_storage_mem_area and max(usage_over_time[lifetime]) > memory_limit
        )
        if not exceeds_limit:
            continue

        for tens in lr.tensors:
            # Only unbuffered FeatureMaps can be evicted
            if tens.purpose == TensorPurpose.FeatureMap and tens.sub_purpose == TensorSubPurpose.Standard:
                tens.mem_area = self.arch.feature_map_storage_mem_area
                tens.mem_type = MemType.Scratch
                # Take the live range out of the usage snapshot
                usage_over_time[lifetime] -= lr.size

932

933

def move_constant_data(self):
    """Replaces selected inputs that are not in scratch memory with clones placed in fast storage.

    An input is replaced when it is a LUT, or when it is a FeatureMap input of a binary elementwise Op with a
    non-empty shape, the Op's IFM and OFM shapes differ, its storage size exceeds the available IFM SHRAM and not
    all of its consumers are VectorProduct operations. LUT clones are placed in Shram. The clone is written to the
    inputs of both the parent operation and the parent pass."""
    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        usable_banks = self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = usable_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
                # Already in scratch memory, i.e. not in permanent storage
                continue

            # Moving only makes sense when permanent storage differs from feature map storage (LUTs always move)
            # NOTE(review): `in` is applied to self.arch.permanent_storage_mem_area - confirm that a containment
            # test rather than an equality test is intended here
            if not (
                (
                    tens.mem_area in self.arch.permanent_storage_mem_area
                    and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                )
                or tens.purpose == TensorPurpose.LUT
            ):
                continue

            if not (
                tens.purpose == TensorPurpose.LUT
                or (
                    tens.purpose == TensorPurpose.FeatureMap
                    and sched_op.op_type.is_binary_elementwise_op()
                    and tens.shape != []
                    and sched_op.ifm.shape != sched_op.ofm.shape
                    and tens.storage_size() > max_ifm_shram_avail
                )
            ):
                continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            if only_vector_product_consumers and not tens.purpose == TensorPurpose.LUT:
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if tens.purpose == TensorPurpose.LUT:
                new_tens.mem_area = MemArea.Shram

            # Hook the clone up in place of the original input
            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            sched_op.parent_ps.inputs[idx] = new_tens

973

974

def print_schedule(self, schedule: Schedule):
    """Prints the name of the schedule, a summary of every Op that has a cost entry in it, and its cascades.

    The SRAM usage of an Op is read from schedule.memory_snapshot at the Op's time index; 0 is printed when the
    time index lies beyond the end of the snapshot."""
    print(f"Schedule: '{schedule.name}'")
    costs = schedule.cost_map
    for op in self.sched_ops:
        if op in costs:
            # Ops without a cost entry are skipped, which allows sub-schedules to be printed
            info = costs[op]
            print(f"\t{op.index}: Operation {op.name} - OFM {op.ofm.shape}")
            print(f"\t\tType: {op.op_type}")
            print(f"\t\tKernel: {op.kernel}")
            print(f"{info}")
            if info.time_index < len(schedule.memory_snapshot):
                sram_used = schedule.memory_snapshot[info.time_index]
            else:
                sram_used = 0
            print(f"\t\tSRAM Used: {sram_used} bytes")

    print("\tCascades:")
    for number, cascade in enumerate(schedule.cascades.values()):
        print(f"\t\t{number}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

996

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

997

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

998

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph, once for each group of memory area and memory types.

    With spilling enabled there are two groups: Scratch_fast tensors in the fast storage area, followed by Scratch
    tensors in the feature map storage area. Without spilling, Scratch and Scratch_fast tensors are allocated
    together in the feature map storage area. The allocator, verbosity and CPU tensor alignment are taken from
    options. Nothing is returned.
    """
    root_sg = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important - the fast storage group has to come first
        alloc_list = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        alloc_list = [(arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1031

"""Entry point for the Scheduler"""

1032

# Initialize CPU subgraphs

1033

schedulers = dict()

1034

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1035

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1036

if sg.placement != PassPlacement.Npu:

1037

# Create cascaded passes for CPU Ops

1038

cascaded_passes = []

1039

for idx, ps in enumerate(sg.passes):

1040

cps = CascadedPass(

1041

ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,

1042

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1043

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1044

cps.time = idx

1045

ps.cascade = cps

1046

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1047

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1048

sg.cascaded_passes = cascaded_passes

1049

else:

1050

# Npu subgraph - create schedule

1051

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1052

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1053

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1054

scheduler.create_scheduler_representation(arch)

1055

sg.sched_ops = scheduler.sched_ops

1056

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1057

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1058

# Create the Max schedule template

1059

max_schedule_template = scheduler.create_initial_schedule()

1060

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1061

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1062

# Create the optimized Max schedule

1063

sg.schedule = max_schedule_template

1064

scheduler.update_op_memory_snapshot(max_schedule_template)

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame^]

1065

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1066

sg.schedule = opt_max_schedule

1067

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1068

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1069

# Create Min schedule

1070

min_schedule = scheduler.propose_minimal_schedule()

1071

initial_sram_limit = scheduler_options.optimization_sram_limit

1072

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1073

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1074

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1075

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1076

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1077

sg.schedule = min_schedule

1078

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1079

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1080

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1081

# Create an optimized schedule

1082

sg.schedule = scheduler.optimize_schedule(

1083

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1084

)

1085

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1086

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1087

scheduler.apply_schedule(sg.schedule)

1088

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1089

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1090

if scheduler_options.verbose_schedule:

1091

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1092

Tim Hall