# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
import copy
from enum import auto
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
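    """Return the storage shape for the given tensor format; for NHCWB16 the depth is
    rounded up to a multiple of 16 (e.g. a depth of 20 is stored as 32)"""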
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.npu_scales_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,)
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        res += (
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n"
        )
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
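    """Lightweight scheduler-side view of a feature map tensor: shape, data type, memory area and format"""
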
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm.resampling_mode
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule = None
        self.scheduler_options = options

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue
                    if not output.needs_linear_format:
                        output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")

        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                cost.slack_buffering_memory,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
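        """Returns True if the weights need to be DMA'd from permanent storage into fast storage"""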
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
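            # Only the fraction of the weights that can be fetched within the previous op's slack cycles
            # is prebuffered, capped at half of the double buffer limit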
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_bytes > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode the weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )

                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.max_range_bytes <= half_buffer_limit
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_buffer_size = weight_buffer_size * 2
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensor = self.buffer_tensor(
                    encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name
                )
                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
                    cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
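        """Creates a scratch tensor in fast storage used to hold the (double-)buffered encoded weights"""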
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name + "_buffer")
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
        next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op = None
        for sched_op in reversed(self.sched_ops):
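            # Produce at least one vertical stride of the consuming Op per stripe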
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            if ref_cost[sched_op].buffered_weight_tensor:
                # If the weights are buffered in the reference schedule they should be in the new proposal
                weight_tensor = cost.npu_weights_tensor
                cost.buffered_weight_tensor = self.buffer_tensor(
                    weight_tensor, TensorSubPurpose.Standard, len(weight_tensor.buffer), weight_tensor.name
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe
            stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = 0
                if cost[sched_op].buffered_weight_tensor:
                    op_weight_buffer = cost[sched_op].buffered_weight_tensor.storage_size()

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
781 ref_cost = ref_schedule.cost_map
782 # Extract the ops that are part of this sub-schedule
783 start = cascade_info.start
784 end = cascade_info.end
785 sub_schedule_ops = self.sched_ops[start : end + 1]
786 # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
787 sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
788 for sched_op in sub_schedule_ops:
789 sub_schedule.cost_map[sched_op] = ref_cost[sched_op]
790
791 sub_schedule.cascades[end] = cascade_info
792 # Use the memory snapshot from the reference schedule
793 sub_schedule.memory_snapshot = ref_schedule.memory_snapshot
794
795 # Calculate memory usage that is live during the sub-schedule but not part of it
796 time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
797 mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
798 # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
799 # included in a cascade or not
800 persistent_initial_ifm = (
801 sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
802 )
803 # Calculate non-local-mem-usage per Operator
804 non_local_mem_usage = {}
805 for idx, sched_op in enumerate(sub_schedule_ops):
806 non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
807 if idx != 0:
808 non_local_mem_usage[sched_op] += persistent_initial_ifm
809
810 cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
811
812 # Start by adding buffering
Tim Hall789e6f32021-06-17 17:02:31 +0100813 buffered_sub_schedule = self.propose_schedule_buffering(
814 sub_schedule, self.scheduler_options.optimization_sram_limit
815 )
Tim Halld8339a72021-05-27 18:49:40 +0100816 # Copy the cascades over from the unbuffered-schedule
817 buffered_sub_schedule.cascades = sub_schedule.cascades
818
819 # Generate the possible stripings for the final Op in the sub-schedule
820 final_ofm_shape = sub_schedule_ops[-1].ofm.shape
821 possible_stripes = [
822 final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
823 ]
824
825 # Propose different striping - the possible stripes are proposed similarly to a binary search
Jacob Bohlinfad72042021-08-24 21:51:41 +0200826 best_schedule = None
Tim Halld8339a72021-05-27 18:49:40 +0100827 iteration = 0
828 while len(possible_stripes) > 1:
829 proposed_stripe = possible_stripes[len(possible_stripes) // 2]
830 proposed_schedule = self.propose_schedule_striping(
831 proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
832 )
833
834 cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
835
836 # Check if proposal fits
837 proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
838 if (proposed_schedule_mem_usage) <= memory_limit:
839 # Remove all possible stripes smaller than this
840 possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
841 best_schedule = proposed_schedule
842 if not proposed_schedule.cascades:
843 # No cascading required - early exit
844 break
845 else:
846 # Proposal doesn't fit within the limit - remove all possible stripes larger than this
847 possible_stripes = possible_stripes[: len(possible_stripes) // 2]
848
849 iteration += 1
850
851 return best_schedule
852
853 def optimize_schedule(
854 self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
855 ) -> Schedule:
856 """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
857 sram_limit = options.optimization_sram_limit
858 if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
859 # Maximum performance schedule fits within the SRAM target
860 return max_sched
861
Jacob Bohlinfad72042021-08-24 21:51:41 +0200862 # Iterate over a copy of the cascades since they may change during the loop
863 for cascade_info in list(schedule.cascades.values()):
Tim Halld8339a72021-05-27 18:49:40 +0100864 # Optimize the sub-schedule in this cascade
865 opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
Jacob Bohlinfad72042021-08-24 21:51:41 +0200866 if opt_sub_schedule:
867 # Remove the existing cascade
868 del schedule.cascades[cascade_info.end]
869 # Update the sub-schedule Op and cascade costs to the full schedule
870 schedule.cost_map.update(opt_sub_schedule.cost_map)
871 schedule.cascades.update(opt_sub_schedule.cascades)
Tim Halld8339a72021-05-27 18:49:40 +0100872
873 # Update memory snapshot
874 self.sg.schedule = schedule
875 self.update_op_memory_snapshot(schedule)
876 # Propose schedule buffering to the optimized schedule
Tim Hall789e6f32021-06-17 17:02:31 +0100877 optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +0100878 # Copy the cascade's metadata from the unbuffered schedule
879 optimized_sched.cascades = schedule.cascades
880 return optimized_sched
881
882 def apply_schedule(self, sched: Schedule):
883 """Applies the given schedule as a final solution"""
884 for sched_op in self.sched_ops:
885 op_info = sched.cost_map[sched_op]
886 cascade_info = sched.cascades.get(op_info.cascade, None)
887 if cascade_info and sched_op in cascade_info.buffers:
888 buffer_tens = sched_op.ifm.connection.parent_tens
889 # Apply memory area and type
890 buffer_tens.mem_area = self.arch.fast_storage_mem_area
891 buffer_tens.mem_type = MemType.Scratch_fast
892 # Apply Rolling buffer
893 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
894 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
895
896 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
897
898 # Ensure that the src_tensor reference is set correctly
899 if op_info.buffered_weight_tensor:
900 op_info.buffered_weight_tensor.src_tensor = op_info.npu_weights_tensor
901
902 def use_fast_storage_for_feature_maps(self, schedule: Schedule, memory_limit: int):
903 if self.arch.fast_storage_mem_area == self.arch.feature_map_storage_mem_area:
904 return
905
906 # Force all OFMs to fast-storage
907 for sched_op in self.sched_ops:
908 cost = schedule.cost_map[sched_op]
909 if cost.cascade == 0:
910 if sched_op.get_dependants():
911 ofm_tens = sched_op.ofm.connection.parent_tens
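                    # Skip OFMs with a missing (None) consumer, e.g. tensors that are also consumed outside the NPU subgraph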
                    if not any(cons is None for cons in ofm_tens.consumer_list):
                        ofm_tens.mem_area = self.arch.fast_storage_mem_area
                        ofm_tens.mem_type = MemType.Scratch_fast

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
            )

        # Iterate over live ranges and evict tensors that don't fit
        fast_storage_snapshot = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        for lr in lr_graph.lrs:
            if (
                lr.mem_area == self.arch.fast_storage_mem_area
                and max(fast_storage_snapshot[lr.start_time : lr.end_time + 1]) > memory_limit
            ):
                # Evict tensor to DRAM
                for tens in lr.tensors:
                    if tens.purpose == TensorPurpose.FeatureMap and tens.sub_purpose == TensorSubPurpose.Standard:
                        # Can only evict unbuffered FeatureMaps
                        tens.mem_area = self.arch.feature_map_storage_mem_area
                        tens.mem_type = MemType.Scratch
                        # Adjust the snapshot
                        fast_storage_snapshot[lr.start_time : lr.end_time + 1] -= lr.size

    def move_constant_data(self):
        """Determine if data can be moved from permanent storage to another memory area. A move
        will generate a DMA command in the high-level command stream"""
        for sched_op in self.sched_ops:
            parent_op = sched_op.parent_op
            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
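            # Half of the SHRAM not reserved for outputs (or the LUT, if used) is available to hold an IFM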
            max_ifm_shram_avail = (
                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
                * self.arch.shram_bank_size
                // 2
            )

            for idx, tens in enumerate(parent_op.inputs):
                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    # Tensor is in permanent storage
                    # Only when permanent storage differs from feature map storage is there a point in moving the data
                    if (
                        tens.mem_area in self.arch.permanent_storage_mem_area
                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                    ) or tens.purpose == TensorPurpose.LUT:
                        if tens.purpose == TensorPurpose.LUT or (
                            # For elementwise broadcast
                            tens.purpose == TensorPurpose.FeatureMap
                            and sched_op.op_type.is_binary_elementwise_op()
                            and tens.shape != []
                            and sched_op.ifm.shape != sched_op.ofm.shape
                            and parent_op.write_shape is None
                            and tens.storage_size() > max_ifm_shram_avail
                        ):
                            only_vector_product_consumers = all(
                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                                for oper in tens.consumers()
                            )

                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                                new_tens = tens.clone_into_fast_storage(self.arch)
                                if tens.purpose == TensorPurpose.LUT:
                                    new_tens.mem_area = MemArea.Shram

                                new_tens.consumer_list.append(parent_op)
                                parent_op.inputs[idx] = new_tens
                                # If the index is out of range, IFM and IFM2 are the same tensor
                                # and pass inputs don't have duplicates
                                if idx < len(sched_op.parent_ps.inputs):
                                    sched_op.parent_ps.inputs[idx] = new_tens

    def print_schedule(self, schedule: Schedule):
        print(f"Schedule: '{schedule.name}'")
        for sched_op in self.sched_ops:
            if sched_op not in schedule.cost_map:
                # Sub-schedule printing
                continue

            op_info = schedule.cost_map[sched_op]
            print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
            print(f"\t\tType: {sched_op.op_type}")
            print(f"\t\tKernel: {sched_op.kernel}")
            print(f"{op_info}")
            mem_usage = (
                schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(schedule.memory_snapshot)
                else 0
            )
            print(f"\t\tSRAM Used: {mem_usage} bytes")

        print(f"\tCascades:")
        for i, cascade in enumerate(schedule.cascades.values()):
            print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Creates live ranges and runs tensor allocator for the current schedule
    (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
    and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )


def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
    """Entry point for the Scheduler"""
    # Initialize CPU subgraphs
    schedulers = dict()
    # Initialize schedulers with max schedule. Only schedule NPU subgraphs
    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            # Create cascaded passes for CPU Ops
            cascaded_passes = []
            for idx, ps in enumerate(sg.passes):
                cps = CascadedPass(
                    ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,
                )

                cps.time = idx
                ps.cascade = cps
                cascaded_passes.append(cps)

            sg.cascaded_passes = cascaded_passes
        else:
            # Npu subgraph - create schedule
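            # The flow below: build the scheduler graph, move constant data, create and buffer the MAX
            # schedule, then build a cascaded MIN schedule and, for the Performance strategy, optimize
            # between the two before applying the result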
            scheduler = Scheduler(nng, sg, arch, scheduler_options)
            schedulers[sg] = scheduler

            scheduler.create_scheduler_representation(arch)
            sg.sched_ops = scheduler.sched_ops
            scheduler.move_constant_data()

            # Create the Max schedule template
            max_schedule_template = scheduler.create_initial_schedule()
            scheduler.max_schedule = max_schedule_template

            # Create the optimised Max schedule
            sg.schedule = max_schedule_template
            scheduler.update_op_memory_snapshot(max_schedule_template)
            opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
            sg.schedule = opt_max_schedule
            scheduler.update_op_memory_snapshot(opt_max_schedule)

            # Create Min schedule
            min_schedule = scheduler.propose_minimal_schedule()
            initial_sram_limit = scheduler_options.optimization_sram_limit
            if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
                initial_sram_limit = scheduler.min_memory_req

            cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())
            cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)
            sg.schedule = min_schedule
            scheduler.update_op_memory_snapshot(min_schedule)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
                # Create an optimized schedule
                sg.schedule = scheduler.optimize_schedule(
                    min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
                )
                scheduler.update_op_memory_snapshot(sg.schedule)

            scheduler.apply_schedule(sg.schedule)
            scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

            if scheduler_options.verbose_schedule:
                scheduler.print_schedule(sg.schedule)

    # Evaluate schedule
    _update_tensor_allocation(nng, arch, options)