Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

20

import copy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

21

from enum import auto

22

from enum import IntEnum

23

from typing import Dict

24

from typing import List

25

from typing import Optional

26

from typing import Tuple

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

28

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

29

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

30

from . import tensor_allocation

31

from . import weight_compressor

32

from .architecture_allocator import ArchitectureBlockConfig

33

from .architecture_allocator import find_block_config

34

from .architecture_allocator import get_ifm_area_required

35

from .architecture_allocator import to_upscale

36

from .architecture_features import ArchitectureFeatures

37

from .architecture_features import Block

38

from .cascade_builder import CascadeBuilder

39

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

40

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

41

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

42

from .nn_graph import Graph

43

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

44

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

45

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

46

from .nn_graph import Subgraph

47

from .numeric_util import round_down

48

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

49

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

50

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

51

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

52

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

53

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

54

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

55

from .tensor import TensorFormat

56

from .tensor import TensorPurpose

57

from .tensor import TensorSubPurpose

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

58

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

59

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

60

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the shape to use when sizing a tensor stored in `tensor_format`.

    For TensorFormat.NHCWB16 the depth is replaced by round_up(depth, 16);
    for every other format the shape is returned unchanged.
    """
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape

class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        # Print just the member name (e.g. "Size") instead of the default enum representation
        return self.name

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

77

class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        """Store the block config and stripe shapes; all other fields start at their defaults."""
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # Default is a single slice covering the whole depth of the stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.npu_scales_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo with the same constructor arguments and cascade.

        Only the five constructor arguments and `cascade` are carried over; every
        other attribute of the result has its freshly-initialised default value.
        """
        res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe)
        res.cascade = self.cascade
        return res

    def __str__(self):
        # `x and f(x)` yields None when the tensor has not been set, which is then printed as "None bytes"
        encoded_weights = self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)
        weight_buffer = self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()
        lines = (
            f"\t\tBlock Config = {self.block_config}",
            f"\t\tOFM Block = {self.block_config.ofm_block}",
            f"\t\tIFM Stripe = {self.stripe_input}",
            f"\t\tIFM2 Stripe = {self.stripe_input2}",
            f"\t\tOFM Stripe = {self.stripe}",
            f"\t\tEncoded Weights = {encoded_weights} bytes",
            f"\t\tWeight buffer = {weight_buffer} bytes",
            f"\t\tDepth slices = {self.ofm_depth_slices}",
            f"\t\tAssigned Cascade = {self.cascade}",
        )
        # No trailing newline after the last line
        return "\n".join(lines)

class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        # Note the rename: the `sram_target` argument is stored as `optimization_sram_limit`
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        # Class name followed by a dump of every instance attribute
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

140

class SchedulerTensor:
    """Lightweight record of the tensor properties the scheduler works with"""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # Filled in later by the SchedulerOperation.add_*_connection methods
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

147

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

148

149

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler node from the primary op and tensors of Pass `ps`.

        `nng` is accepted for interface compatibility; it is not used here.
        """
        primary_op = ps.primary_op

        self.arch = arch
        self.parent_ps = ps
        self.parent_op = primary_op
        self.name = primary_op.name
        self.op_type = primary_op.type
        self.activation = primary_op.activation
        self.kernel = primary_op.kernel
        self.resampling_mode = primary_op.ifm.resampling_mode
        # True when the op has a second input and either input has an empty ([]) shape
        self.uses_scalar = primary_op.ifm2 is not None and (primary_op.ifm.shape == [] or primary_op.ifm2.shape == [])
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes (0 when the operation has no IFM2)"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        Side effect: the chosen block config is also written to `self.parent_ps.block_config`.
        `nng` is accepted for interface compatibility; it is not used here.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, to_upscale(self.resampling_mode))

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the IFM requirement (unpacked by __init__ as width, height) for a 1x1 OFM stripe"""
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        # Returns a block config and SHRAM layout
        # Two LUT banks are requested when the parent op has an activation LUT, otherwise none
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

317

class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Per-operation scheduling metadata for this particular schedule
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        """Subgraph name and schedule label joined by an underscore"""
        return f"{self.sg.name}_{self.label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

331

332

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

333

class Scheduler:

334

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

335

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

336

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

337

self.nng = nng

338

self.sg = sg

339

self.arch = arch

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

340

self.sched_ops: List(SchedulerOperation) = []

341

self.max_schedule = None

342

self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

343

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

344

def create_scheduler_representation(self, arch: ArchitectureFeatures):

345

"""Creates a Scheduler Graph representation"""

346

# Temporary dict for creating connections between the Operations

347

connections: Dict[Tensor, Connection] = {}

348

# Memory required for the largest FeatureMap that has to be full

349

min_memory_req = 0

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

350

for ps in self.sg.passes:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

351

if ps.primary_op:

352

# Set tensor format to NHCWB16 for output FeatureMaps, if possible

Louis Verhaard

0b9c9a3

2020-09-15 14:05:38 +0200

[diff] [blame]

353

for output in ps.outputs:

Patrik Gustavsson

d1836c7

2021-02-04 08:22:18 +0100

[diff] [blame]

354

if output.purpose != TensorPurpose.FeatureMap:

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

355

continue

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

356

if not output.needs_linear_format:

Louis Verhaard

0b9c9a3

2020-09-15 14:05:38 +0200

[diff] [blame]

357

output.set_format(TensorFormat.NHCWB16, arch)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

358

359

# Create SchedulerOperations

360

op = SchedulerOperation(ps, arch, self.nng)

361

op.index = len(self.sched_ops)

362

363

# Make connections

364

if ps.ifm_tensor not in connections:

365

connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)

366

if ps.ifm2_tensor and ps.ifm2_tensor not in connections:

367

connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)

368

if ps.ofm_tensor not in connections:

369

connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

370

371

op.add_ifm_connection(connections[ps.ifm_tensor])

372

if ps.ifm2_tensor:

373

op.add_ifm2_connection(connections[ps.ifm2_tensor])

374

op.add_ofm_connection(connections[ps.ofm_tensor])

375

376

# Set requirements on the ifm/ofm buffers

377

self.sched_ops.append(op)

378

if ps.ifm_tensor in self.sg.input_tensors:

379

# This Op consumes a subgraph input

380

op.requires_full_ifm = True

381

if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:

382

# This Op consumes a subgraph input

383

op.requires_full_ifm2 = True

384

if ps.ofm_tensor in self.sg.output_tensors:

385

# This Op produces a subgraph output

386

op.requires_full_ofm = True

387

if ps.ifm_tensor.needs_linear_format:

388

op.requires_full_ifm = True

389

if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:

390

op.requires_full_ifm2 = True

391

if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:

392

op.requires_full_ofm = True

393

if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:

394

# Op has multiple outputs or consumers - requires full OFM

395

op.requires_full_ofm = True

396

397

# Check memory requirements if this Op requires any full FeatureMaps

398

op_memory_req = 0

399

if op.requires_full_ifm:

400

op_memory_req += op.ifm_size_in_bytes()

401

if op.requires_full_ifm2:

402

op_memory_req += op.ifm2_size_in_bytes()

403

if op.requires_full_ofm:

404

op_memory_req += op.ofm_size_in_bytes()

405

406

min_memory_req = max(op_memory_req, min_memory_req)

407

408

# Theoretical minimum required memory - used to guide the cascade building

409

self.min_memory_req = min_memory_req

410

411

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind"""
    schedule = Schedule(self.sg, "MAX")

    for op in self.sched_ops:
        # The stripe is the whole OFM, i.e. the operation is not subdivided
        full_ofm = op.ofm.shape
        cost = op.create_scheduler_info(self.nng, full_ofm)
        cost.cycles = self.estimate_op_performance(op, cost.block_config, full_ofm.depth)
        schedule.cost_map[op] = cost

    return schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Record fast-storage usage over time, and its peak, on `schedule`.

    Live ranges are extracted from the cascaded passes of the root subgraph for
    the fast storage memory area; the resulting temporal usage is stored in
    `schedule.memory_snapshot` and its maximum in `schedule.fast_storage_peak_usage`.
    """
    memories_list = [(self.arch.fast_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
        )

    # Populate time-array with memory used by live ranges
    temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = temporal_usage

    # Set the peak memory usage (0 when the snapshot is empty)
    schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

438

439

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Return the result of npu_performance.measure_cycle_cost for `op`.

    The query uses the full IFM/IFM2 shapes of the op and the op's OFM shape with
    its depth replaced by `ofm_depth`, evaluated for the given `block_config`.
    """
    ifm, ifm2, ofm = op.ifm, op.ifm2, op.ofm

    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = ifm.shape
    query.ifm_memory_area = ifm.mem_area
    query.ifm_bits = ifm.dtype.size_in_bits()
    query.ifm_format = ifm.format
    # The IFM2 fields are None when the op has no second input
    query.ifm2_shape = ifm2 and ifm2.shape
    query.ifm2_memory_area = ifm2 and ifm2.mem_area
    query.ifm2_bits = ifm2 and ifm2.dtype.size_in_bits()
    query.ifm2_format = ifm2 and ifm2.format
    query.ofm_shape = ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = ofm.mem_area
    query.ofm_bits = ofm.dtype.size_in_bits()
    query.ofm_format = ofm.format
    if op.parent_op.bias:
        # Note: the constant shape uses the op's full OFM depth, not `ofm_depth`
        query.const_shape = Shape4D(1, 1, 1, ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

461

462

def propose_schedule_buffering(self, ref_schedule: Schedule):
    """Create a buffered schedule"""
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")
    staging_limit_bytes = self.scheduler_options.optimization_sram_limit

    # The most recently processed op; None for the first op that is handled
    prev_op = None
    for sched_op in self.sched_ops:
        if sched_op not in ref_schedule.cost_map:
            # sched_op is not part of this sub-schedule - skip
            continue

        self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
        prev_op = sched_op

    return buffered_schedule

477

478

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for `sched_op` to `buffered_schedule`, with weight buffering if it has weights.

    The entry starts as a shallow copy of the reference cost. `prev_op` is the
    previously processed operation, or None when there is none.

    Returns the new cost entry, or None if `sched_op` already had an entry.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return

    # Take the reference schedule as default costings for this schedule
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
    memory_snapshot = ref_schedule.memory_snapshot
    # A time index beyond the end of the snapshot is treated as zero usage
    ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
    cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    if sched_op.parent_op.weights:
        self.propose_weight_buffering(
            sched_op.parent_op.weights,
            sched_op.parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            cost.slack_buffering_memory,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True when `weight_tensor` should be moved from permanent to fast storage.

    That is the case when the tensor is set, its mem_type is neither Scratch nor
    Scratch_fast, its mem_area is Dram or OffChipFlash, and the architecture's
    permanent storage area differs from its fast storage area. Otherwise False.
    """
    if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
        # Weights are in permanent storage
        # Only when permanent storage differs from feature map storage, there is a point moving the data
        if (
            weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
            and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
        ):
            return True

    return False

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Decide how the weights of `sched_op` are encoded and buffered in `buffered_schedule`.

    Updates the cost entry of `sched_op` in place: the OFM depth slices, the encoded
    weight/scale tensors and, when a buffer fits within `buffer_limit_bytes`, a
    `buffered_weight_tensor` placed in fast storage. For cascaded ops the memory
    snapshot of `ref_schedule` is also increased by the full weight size.

    `prev_op` may be None; slack cycles and memory are then taken as 0.

    NOTE(review): block nesting was reconstructed from a flattened listing -
    confirm the indentation of the double-buffering section against the repository.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    encoded_weights = full_weights
    encoded_scales = full_scales

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with limited
        # double buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)
            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            prebuffer_depth = int(ref_cost.stripe.depth * prebuffer_ratio)

            # Round prebuffering down to nearest valid split depth
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            while True:
                buffering_depth = max(cost.block_config.ofm_block.depth, prebuffer_depth)

                # Clamp buffering to the double buffering limit
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth
                    # Round the clamped depth itself (previously prebuffer_depth was rounded here,
                    # which discarded the clamp computed on the line above)
                    buffering_depth = int(max(16, round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )

                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.max_range_bytes <= half_buffer_limit
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

    # Determine whether the weights need to be double buffered
    weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = Tensor(
            [1, 1, 1, weight_buffer_size], DataType.uint8, weight_tensor.name + "_buffer"
        )
        cost.buffered_weight_tensor.src_tensor = encoded_weights
        cost.buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        cost.buffered_weight_tensor.mem_type = MemType.Scratch_fast
        cost.buffered_weight_tensor.purpose = TensorPurpose.Weights
        cost.buffered_weight_tensor.sub_purpose = weight_tensor_purpose
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

678

679

def propose_minimal_schedule(self) -> Schedule:
    """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
    next operator's stride.

    Returns a new Schedule labelled "MIN" whose cost map holds one entry for every Op in self.sched_ops."""
    min_schedule = Schedule(self.sg, "MIN")
    cost_map = min_schedule.cost_map

    # Keep track of the previous Op - which consumes the current Op's OFM
    prev_op = None
    for sched_op in reversed(self.sched_ops):
        # The first Op visited (the last one in self.sched_ops) has no previous Op and gets a stripe height of 1
        min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
        min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

        cost = sched_op.create_scheduler_info(self.nng, min_stripe)
        cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
        cost_map[sched_op] = cost

        prev_op = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down.

    final_stripe is the stripe used for the last Op that is present in ref_schedule; label names the returned
    Schedule. Ops that have no entry in ref_schedule.cost_map are left out of the returned Schedule."""
    ref_cost = ref_schedule.cost_map

    striped_schedule = Schedule(self.sg, label)
    stripe = final_stripe
    # Walk back to front so that each Op's stripe can be derived from the Op that follows it
    for sched_op in reversed(self.sched_ops):
        if sched_op not in ref_cost:
            # sched_op is not part of the sub-schedule - skip
            continue

        # Create a cost entry with the new stripe
        cost = sched_op.create_scheduler_info(self.nng, stripe)

        # Copy the weight buffering from the reference schedule
        cost.buffered_weight_tensor = ref_cost[sched_op].buffered_weight_tensor

        # Estimate performance
        cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
        striped_schedule.cost_map[sched_op] = cost

        # Calculate the preceding Op's stripe
        stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

    return striped_schedule

724

725

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Estimates the memory usage of a schedule.

    schedule supplies the per-Op costs (cost_map) and the cascades; non_local_mem_usage maps an Op to the extra
    memory that is added to its own usage (Ops missing from the mapping contribute 0 extra).

    Returns the largest usage found over all Ops of self.sched_ops that have an entry in schedule.cost_map,
    or 0 when none of them has one."""
    cost = schedule.cost_map
    cascades = schedule.cascades
    peak_mem_usage = 0
    for sched_op in self.sched_ops:
        if sched_op not in cost:
            # sched_op is not part of the sub-schedule - skip
            continue

        op_cost = cost[sched_op]
        if op_cost.cascade:
            # This Op is part of a cascade - use the cascade's memory usage
            cascade_info = cascades[op_cost.cascade]
            # Non-local memory usage is already included in the cascade_info
            peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
        else:
            # This Op is not part of a cascade - calculate the memory usage
            op_weight_buffer = 0
            if op_cost.buffered_weight_tensor:
                op_weight_buffer = op_cost.buffered_weight_tensor.storage_size()

            op_mem_usage = (
                sched_op.ifm_size_in_bytes()
                + sched_op.ofm_size_in_bytes()
                + op_weight_buffer
                + non_local_mem_usage.get(sched_op, 0)
            )
            peak_mem_usage = max(op_mem_usage, peak_mem_usage)

    return peak_mem_usage

755

756

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
    proposing weight buffering and then continuously proposing new stripe sizes.

    Returns the last proposed schedule whose estimated memory usage was within memory_limit, or the buffered
    sub-schedule when no striping proposal fitted."""
    ref_cost = ref_schedule.cost_map
    # Extract the ops that are part of this sub-schedule
    start = cascade_info.start
    end = cascade_info.end
    sub_schedule_ops = self.sched_ops[start : end + 1]
    # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
    sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
    for sched_op in sub_schedule_ops:
        sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

    sub_schedule.cascades[end] = cascade_info
    # Use the memory snapshot from the reference schedule
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Calculate memory usage that is live during the sub-schedule but not part of it
    time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
    mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
    # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
    # included in a cascade or not
    persistent_initial_ifm = (
        sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
    )
    # Calculate non-local-mem-usage per Operator
    non_local_mem_usage = {}
    for idx, sched_op in enumerate(sub_schedule_ops):
        non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
        if idx != 0:
            # The first Op already accounts for its own IFM, every later Op has to carry it as extra usage
            non_local_mem_usage[sched_op] += persistent_initial_ifm

    cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Start by adding buffering
    buffered_sub_schedule = self.propose_schedule_buffering(sub_schedule)
    # Copy the cascades over from the unbuffered-schedule
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Generate the possible stripings for the final Op in the sub-schedule
    final_ofm_shape = sub_schedule_ops[-1].ofm.shape
    possible_stripes = [
        final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
    ]

    # Propose different striping - the possible stripes are proposed similarly to a binary search
    best_schedule = buffered_sub_schedule
    iteration = 0
    while len(possible_stripes) > 1:
        # Index of the candidate in the middle of the remaining (height-ordered) stripes
        mid = len(possible_stripes) // 2
        proposed_stripe = possible_stripes[mid]
        proposed_schedule = self.propose_schedule_striping(
            proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
        )

        cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

        # Check if proposal fits
        proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
        if proposed_schedule_mem_usage <= memory_limit:
            # Remove all possible stripes smaller than this
            possible_stripes = possible_stripes[mid:]
            best_schedule = proposed_schedule
            if not proposed_schedule.cascades:
                # No cascading required - early exit
                break
        else:
            # Proposal doesn't fit within the limit - remove all possible stripes larger than this
            possible_stripes = possible_stripes[:mid]

        iteration += 1

    return best_schedule

def optimize_schedule(
    self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
) -> Schedule:
    """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule.

    Returns max_sched untouched when its fast_storage_peak_usage is below options.optimization_sram_limit and
    spilling is disabled. Otherwise schedule is updated in place (cost_map and cascades), assigned to
    self.sg.schedule, and the result of proposing weight buffering on it is returned."""
    sram_limit = options.optimization_sram_limit
    if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
        # Maximum performance schedule fits within the SRAM target
        return max_sched

    # Extract the cascades - a copy is needed because schedule.cascades is modified inside the loop
    cascades = list(schedule.cascades.values())
    for cascade_info in cascades:
        # Remove existing cascade from schedule
        del schedule.cascades[cascade_info.end]
        # Optimize the sub-schedule in this cascade
        opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        # Update the sub-schedule Op and cascade costs to the full schedule
        schedule.cost_map.update(opt_sub_schedule.cost_map)
        schedule.cascades.update(opt_sub_schedule.cascades)

    # Update memory snapshot
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)
    # Propose schedule buffering to the optimized schedule
    optimized_sched = self.propose_schedule_buffering(schedule)
    # Copy the cascade's metadata from the unbuffered schedule
    optimized_sched.cascades = schedule.cascades
    return optimized_sched

859

860

def apply_schedule(self, sched: Schedule):
    """Applies the given schedule as a final solution.

    For every Op in self.sched_ops the block config of its cost entry is written to the Op's parent pass. Ops
    that are listed in the buffers of their cascade additionally get their IFM tensor placed in fast storage as
    a rolling buffer."""
    for sched_op in self.sched_ops:
        op_info = sched.cost_map[sched_op]
        # No cascade is registered under this key when the Op is not cascaded
        cascade_info = sched.cascades.get(op_info.cascade)
        if cascade_info and sched_op in cascade_info.buffers:
            buffer_tens = sched_op.ifm.connection.parent_tens
            # Apply memory area and type
            buffer_tens.mem_area = self.arch.fast_storage_mem_area
            buffer_tens.mem_type = MemType.Scratch_fast
            # Apply Rolling buffer
            buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
            buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)

        sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

        # Ensure that the src_tensor reference is set correctly
        if op_info.buffered_weight_tensor:
            op_info.buffered_weight_tensor.src_tensor = op_info.npu_weights_tensor

879

880

def use_fast_storage_for_feature_maps(self, schedule: Schedule, memory_limit: int):
    """Moves the OFMs of non-cascaded Ops to fast storage and evicts feature maps again where the limit is exceeded.

    Nothing is done when fast storage and feature map storage are the same memory area. memory_limit is the
    highest fast storage usage that is accepted over the live range of a tensor."""
    if self.arch.fast_storage_mem_area == self.arch.feature_map_storage_mem_area:
        return

    # Force all OFMs to fast-storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]
        if cost.cascade == 0 and sched_op.get_dependants():
            ofm_tens = sched_op.ofm.connection.parent_tens
            # Tensors with a None entry in their consumer list are left where they are
            if not any(cons is None for cons in ofm_tens.consumer_list):
                ofm_tens.mem_area = self.arch.fast_storage_mem_area
                ofm_tens.mem_type = MemType.Scratch_fast

    # Collect live ranges from tensors
    memories_list = [(self.arch.fast_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
        )

    # Iterate over live ranges and evict tensors that doesn't fit
    # NOTE(review): the in-place subtraction on a slice below presumably relies on the snapshot being a numpy
    # array - confirm against LiveRangeGraph.get_temporal_memory_usage
    fast_storage_snapshot = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    for lr in lr_graph.lrs:
        if (
            lr.mem_area == self.arch.fast_storage_mem_area
            and max(fast_storage_snapshot[lr.start_time : lr.end_time + 1]) > memory_limit
        ):
            # Evict tensor to DRAM
            for tens in lr.tensors:
                if tens.purpose == TensorPurpose.FeatureMap and tens.sub_purpose == TensorSubPurpose.Standard:
                    # Can only evict unbuffered FeatureMaps
                    tens.mem_area = self.arch.feature_map_storage_mem_area
                    tens.mem_type = MemType.Scratch
                    # Adjust the snapshot
                    # NOTE(review): lr.size is subtracted once per evicted tensor - confirm this is intended
                    # when a live range holds more than one tensor
                    fast_storage_snapshot[lr.start_time : lr.end_time + 1] -= lr.size

917

918

def move_constant_data(self):
    """Determine if data can be moved from permanent storage to another memory area. A move
    will generate a DMA command in the high-level command stream.

    Qualifying input tensors are replaced, both on the parent operation and on the parent pass, by a clone
    created with clone_into_fast_storage; LUT clones are placed in SHRAM."""
    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        # Half of the SHRAM that remains after the reserved output banks have been taken off
        max_ifm_shram_avail = (
            (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
            * self.arch.shram_bank_size
            // 2
        )

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                # Tensor is in permanent storage
                # Only when permanent storage differs from feature map storage, there is a point moving the data
                if (
                    tens.mem_area in self.arch.permanent_storage_mem_area
                    and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                ) or tens.purpose == TensorPurpose.LUT:
                    # Besides LUTs, only non-scalar feature map inputs of binary elementwise Ops are considered,
                    # and only when the IFM and OFM shapes differ and the tensor exceeds the available SHRAM
                    if tens.purpose == TensorPurpose.LUT or (
                        tens.purpose == TensorPurpose.FeatureMap
                        and sched_op.op_type.is_binary_elementwise_op()
                        and tens.shape != []
                        and sched_op.ifm.shape != sched_op.ofm.shape
                        and tens.storage_size() > max_ifm_shram_avail
                    ):
                        only_vector_product_consumers = all(
                            oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                            for oper in tens.consumers()
                        )

                        if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                            new_tens = tens.clone_into_fast_storage(self.arch)
                            if tens.purpose == TensorPurpose.LUT:
                                new_tens.mem_area = MemArea.Shram

                            # Rewire the operation and its pass to read from the clone
                            new_tens.consumer_list.append(parent_op)
                            parent_op.inputs[idx] = new_tens
                            sched_op.parent_ps.inputs[idx] = new_tens

958

959

def print_schedule(self, schedule: Schedule):
    """Prints the schedule's name, one section for every Op that has a cost entry, and the list of cascades.

    Ops of self.sched_ops without an entry in schedule.cost_map are skipped, which allows sub-schedules to be
    printed. The SRAM usage of an Op is reported as 0 when its time index is outside the memory snapshot."""
    print(f"Schedule: '{schedule.name}'")
    for sched_op in self.sched_ops:
        if sched_op not in schedule.cost_map:
            # Sub-schedule printing
            continue

        op_info = schedule.cost_map[sched_op]
        print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
        print(f"\t\tType: {sched_op.op_type}")
        print(f"\t\tKernel: {sched_op.kernel}")
        print(f"{op_info}")
        mem_usage = (
            schedule.memory_snapshot[op_info.time_index]
            if op_info.time_index < len(schedule.memory_snapshot)
            else 0
        )
        print(f"\t\tSRAM Used: {mem_usage} bytes")

    print("\tCascades:")
    for i, cascade in enumerate(schedule.cascades.values()):
        print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

981

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

982

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

983

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph for every memory area that holds scratch tensors.

    With spilling enabled, Scratch_fast tensors are allocated in the fast storage memory area first and Scratch
    tensors in the feature map storage memory area afterwards. Without spilling, both memory types are allocated
    together in the feature map storage memory area.

    options supplies tensor_allocator, verbose_allocation and cpu_tensor_alignment. Nothing is returned.
    NOTE(review): live ranges are presumably created inside tensor_allocation.allocate_tensors - confirm.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, {MemType.Scratch_fast})
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, {MemType.Scratch})
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1016

"""Entry point for the Scheduler"""

1017

# Initialize CPU subgraphs

1018

schedulers = dict()

1019

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1020

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1021

if sg.placement != PassPlacement.Npu:

1022

# Create cascaded passes for CPU Ops

1023

cascaded_passes = []

1024

for idx, ps in enumerate(sg.passes):

1025

cps = CascadedPass(

1026

ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,

1027

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1028

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1029

cps.time = idx

1030

ps.cascade = cps

1031

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1032

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1033

sg.cascaded_passes = cascaded_passes

1034

else:

1035

# Npu subgraph - create schedule

1036

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1037

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1038

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1039

scheduler.create_scheduler_representation(arch)

1040

sg.sched_ops = scheduler.sched_ops

1041

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1042

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1043

# Create the Max schedule template

1044

max_schedule_template = scheduler.create_initial_schedule()

1045

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1046

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1047

# Create the optimimised Max schedule

1048

sg.schedule = max_schedule_template

1049

scheduler.update_op_memory_snapshot(max_schedule_template)

1050

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template)

1051

sg.schedule = opt_max_schedule

1052

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1053

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1054

# Create Min schedule

1055

min_schedule = scheduler.propose_minimal_schedule()

1056

initial_sram_limit = scheduler_options.optimization_sram_limit

1057

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1058

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1059

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1060

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1061

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1062

sg.schedule = min_schedule

1063

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1064

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1065

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1066

# Create an optimized schedule

1067

sg.schedule = scheduler.optimize_schedule(

1068

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1069

)

1070

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1071

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1072

scheduler.apply_schedule(sg.schedule)

1073

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1074

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1075

if scheduler_options.verbose_schedule:

1076

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1077

Tim Hall