Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

20

# For Class name forward references for the type annotations. (see PEP 563).

21

from __future__ import annotations

22

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

23

import copy

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

24

from collections import namedtuple

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

25

from enum import auto

26

from enum import IntEnum

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame^]

27

from typing import Any

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

28

from typing import Dict

29

from typing import List

30

from typing import Optional

31

from typing import Tuple

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

32

from typing import TYPE_CHECKING

33

34

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.

35

if TYPE_CHECKING:

36

from .npu_performance import CycleCost

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

37

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

38

import numpy as np

39

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

40

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

41

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

42

from . import tensor_allocation

43

from . import weight_compressor

44

from .architecture_allocator import ArchitectureBlockConfig

45

from .architecture_allocator import find_block_config

46

from .architecture_allocator import get_ifm_area_required

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

47

from .architecture_features import ArchitectureFeatures

48

from .architecture_features import Block

49

from .cascade_builder import CascadeBuilder

50

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

51

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

52

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

53

from .nn_graph import Graph

54

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

55

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

56

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

57

from .nn_graph import Subgraph

58

from .numeric_util import round_down

59

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

60

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

61

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

62

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

63

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

64

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

65

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

66

from .tensor import TensorFormat

67

from .tensor import TensorPurpose

68

from .tensor import TensorSubPurpose

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

69

from .weight_compressor import NpuWeightTensor

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

70

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

71

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

72

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the shape a tensor occupies in storage for the given format.

    For TensorFormat.NHCWB16 the depth is rounded up to a multiple of 16;
    for every other format the shape is returned unchanged.
    """
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape

class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        # Print the bare member name (e.g. "Size") rather than the default enum representation
        return self.name

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

89

class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # Defaults to a single slice spanning the full depth of the OFM stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensor: Optional[Tensor] = None
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a new SchedulerOpInfo built from this one's constructor arguments.

        Only the constructor arguments and the cascade assignment are carried over;
        every other attribute of the result holds its constructor default.
        """
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        """Return a multi-line, tab-indented summary of this info"""
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        # The 'x and f(x)' form prints None when the tensor has not been set
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        res += (
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n"
        )
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res

class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        # Note that the 'sram_target' argument is stored under a different attribute name
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        """Return the class name followed by the instance attribute dictionary"""
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

161

class SchedulerTensor:
    """Holds the shape, data type, memory area and format of one feature map of a SchedulerOperation"""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # Starts unset; SchedulerOperation.add_*_connection assigns the Connection later
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

168

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

169

170

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        # True when the op has a second input and either input has an empty shape
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input

        Raises AssertionError if this operation has no IFM2.
        """
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            # Raised explicitly (rather than 'assert False') so the check is not removed under 'python -O'
            raise AssertionError(f"Trying to set an IFM2 Connection to {self} which has no IFM2")

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 if there is no IFM2"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        As a side effect the parent Pass's block_config is overwritten with the
        old-style representation of the selected block config.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                # The IFM2 stripe is the IFM stripe clamped to the IFM2 dimensions
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the IFM requirement for the smallest possible stripe"""
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        """Returns the result of find_block_config for the given IFM/IFM2/OFM shapes"""
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        """Return '<Connection NAME>' where NAME is the name of the parent tensor"""
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

353

class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        """The subgraph name and the schedule label joined by an underscore"""
        return f"{self.sg.name}_{self.label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

367

368

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

369

class Scheduler:

370

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

371

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

372

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
    """Store the graph, subgraph, architecture and options, and start with empty scheduling state"""
    self.nng = nng
    self.sg = sg
    self.arch = arch
    self.sched_ops: List[SchedulerOperation] = []
    self.max_schedule: Optional[Schedule] = None
    self.scheduler_options = options

    self.scratched_fms: Dict[Tensor, Any] = {}
    self.evicted_fms: List[live_range.LiveRange] = []

382

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

383

def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
    """Return True if the OFM of the given pass should not be converted to NHCWB16.

    True is only returned when the optimization strategy is not Performance, the
    primary op is elementwise, the OFM depth is not a multiple of 16, and an input
    with the same op shape, format and dtype as the output is in NHWC format.
    The 'tens' and 'arch' arguments are not used by this check.
    """
    # Only run this check for opt strategy Size
    if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
        return False

    op = ps.primary_op
    if not op.type.is_elementwise_op():
        return False

    depth = op.ofm_shapes[0][-1]
    if (depth % 16) == 0:
        return False

    # Check if overwriting the inputs can be allowed
    OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
    outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
    inps = []
    if op.ifm is not None:
        inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
    if op.ifm2 is not None:
        inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

    # Find an input tensor that can be overwritten by the output
    for inp in inps:
        if (
            # check op input and output shapes allow overlapping
            inp.op_shape == outp.op_shape
            # check input tensor is valid
            and inp.tens is not None
            and inp.tens.shape != []
            # check input and output tensors are compatible
            and inp.tens.format == outp.tens.format
            and inp.tens.dtype == outp.tens.dtype
        ):
            if inp.tens.format == TensorFormat.NHWC:
                return True

    return False

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

422

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Creates a Scheduler Graph representation

    For every pass with a primary op this sets the output tensor formats, appends a
    SchedulerOperation to self.sched_ops, connects it to its input/output tensors and
    flags which feature maps it needs in full. The largest per-op memory requirement
    of those full feature maps is stored in self.min_memory_req.
    """
    # Temporary dict for creating connections between the Operations
    connections: Dict[Tensor, Connection] = {}
    # Memory required for the largest FeatureMap that has to be full
    min_memory_req = 0
    for ps in self.sg.passes:
        if not ps.primary_op:
            # Passes without a primary op get no SchedulerOperation
            continue

        # Set tensor format to NHCWB16 for output FeatureMaps, if possible
        for output in ps.outputs:
            if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                continue

            if output.needs_linear_format:
                continue

            if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                output.needs_linear_format = True
                continue

            output.set_format(TensorFormat.NHCWB16, arch)

        # Create SchedulerOperations
        op = SchedulerOperation(ps, arch, self.nng)
        op.index = len(self.sched_ops)

        # Make connections
        if ps.ifm_tensor not in connections:
            connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
        if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
            connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
        if ps.ofm_tensor not in connections:
            connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

        op.add_ifm_connection(connections[ps.ifm_tensor])
        if ps.ifm2_tensor:
            op.add_ifm2_connection(connections[ps.ifm2_tensor])
        op.add_ofm_connection(connections[ps.ofm_tensor])

        # Set requirements on the ifm/ofm buffers
        self.sched_ops.append(op)
        if ps.ifm_tensor in self.sg.input_tensors:
            # This Op consumes a subgraph input
            op.requires_full_ifm = True
        if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
            # This Op consumes a subgraph input
            op.requires_full_ifm2 = True
        if ps.ofm_tensor in self.sg.output_tensors:
            # This Op produces a subgraph output
            op.requires_full_ofm = True
        if ps.ifm_tensor.needs_linear_format:
            op.requires_full_ifm = True
        if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
            op.requires_full_ifm2 = True
        if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
            op.requires_full_ofm = True
        if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
            # Op has multiple outputs or consumers - requires full OFM
            op.requires_full_ofm = True

        # Check memory requirements if this Op requires any full FeatureMaps
        op_memory_req = 0
        if op.requires_full_ifm:
            op_memory_req += op.ifm_size_in_bytes()
        if op.requires_full_ifm2:
            op_memory_req += op.ifm2_size_in_bytes()
        if op.requires_full_ofm:
            op_memory_req += op.ofm_size_in_bytes()

        min_memory_req = max(op_memory_req, min_memory_req)

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = min_memory_req

495

496

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind"""
    schedule = Schedule(self.sg, "MAX")

    for op in self.sched_ops:
        # Every op is costed for its full OFM shape and full OFM depth
        cost = op.create_scheduler_info(self.nng, op.ofm.shape)
        cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
        schedule.cost_map[op] = cost

    return schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Store the temporal fast-storage memory usage and its peak on the given schedule"""
    memories_list = [(self.arch.fast_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(),
            mem_area,
            mem_type_set,
            lr_graph,
            Tensor.AllocationQuantum,
        )

    # Populate time-array with memory used by live ranges
    temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = temporal_usage

    # Set the peak memory usage
    schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

526

527

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Return the result of npu_performance.measure_cycle_cost for the given op.

    The query describes the op's IFM, IFM2 and OFM, with the OFM depth replaced
    by 'ofm_depth', and uses 'block_config' as its config.
    """
    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm_format = op.ifm.format
    # The 'x and x.attr' form leaves the IFM2 fields as None when the op has no IFM2
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    if op.parent_op.bias:
        # Note that the const shape uses the full OFM depth, not 'ofm_depth'
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

549

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

550

def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
    """Create a buffered schedule"""
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

    # The previously visited op is passed along with each op in turn
    prev_op = None
    for sched_op in self.sched_ops:
        if sched_op not in ref_schedule.cost_map:
            # sched_op is not part of this sub-schedule - skip
            continue

        self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
        prev_op = sched_op

    return buffered_schedule

564

565

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for 'sched_op' to 'buffered_schedule', based on 'ref_schedule'.

    The entry is a shallow copy of the reference cost with its slack buffering
    cycles and memory filled in. Weight buffering is then proposed for ops that
    have a weights tensor.

    Returns the new cost entry, or None if 'sched_op' already has an entry in
    'buffered_schedule'.
    """
    # Mild recursion might mean this Op has already been seen
    if sched_op in buffered_schedule.cost_map:
        return

    # Take the reference schedule as default costings for this schedule
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
    memory_snapshot = ref_schedule.memory_snapshot
    # A time index beyond the end of the snapshot counts as zero memory usage
    ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
    cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
    buffered_schedule.cost_map[sched_op] = cost

    # Attempt weight buffering on anything with a weights tensor
    if sched_op.parent_op.weights:
        buffer_limit_bytes = cost.slack_buffering_memory

        # If applicable apply size limitation, but keep it within reason (ratio 1.5).
        # Size limitation is used when use_fast_storage_for_feature_maps have
        # detected that there are fms that do not fit in fast storage.
        if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
            buffer_limit_bytes -= sched_op.evicted_fms_size

        self.propose_weight_buffering(
            sched_op.parent_op.weights,
            sched_op.parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            buffer_limit_bytes,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True if the weight tensor should be moved by DMA, otherwise False.

    False is returned when 'weight_tensor' is falsy or its memory type is
    Scratch or Scratch_fast.
    """
    if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
        # Weights are in permanent storage
        # Only when permanent storage differs from feature map storage, there is a point moving the data
        if (
            weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
            and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
        ):
            return True

    return False

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: SchedulerOperation,
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Proposes weight buffering for one operator and stores the result in its cost entry.

    The weights are first encoded for the full OFM depth. FullyConnected operators, and weights for which
    weights_needs_dma() is False, use that full-depth encoding directly and no buffer is created. Otherwise:
      * cascaded operators (ref_cost.cascade != 0) keep the full depth and the full weight size is added to
        ref_schedule.memory_snapshot at the operator's time index,
      * non-cascaded operators may have their weights split into depth slices, sized from the slack cycles of
        the previous operator and from half of buffer_limit_bytes.
    A buffer tensor is only created when the chosen buffer size is within buffer_limit_bytes; if it is not,
    the full-depth encoding is used without buffering.

    The entry buffered_schedule.cost_map[sched_op] is updated in place (ofm_depth_slices, npu_weights_tensor,
    npu_scales_tensor and, where applicable, full_weight_transfer_cycles, slack_buffering_cycles,
    buffered_weight_tensor and slack_buffering_memory). Nothing is returned.

    prev_op may be absent from buffered_schedule.cost_map, in which case no slack cycles/memory are assumed.
    """
    cost = buffered_schedule.cost_map[sched_op]
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    # A single slice covering the whole depth of the reference stripe
    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    # Default to the full-depth encoding; replaced below if the weights get sliced
    encoded_weights: Optional[NpuWeightTensor] = full_weights
    encoded_scales = full_scales

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with limited
        # double buffer buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

        # Recompute the ratio from the (possibly clamped) prebuffer size
        prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            block_depth = cost.block_config.ofm_block.depth

            # Choose initial prebuffering depth (already buffer clamped)
            prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            # Calculate cycles executed during the prebuffer
            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
            buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

            # Choose initial buffering depth and clamp to the double buffering limit
            buffering_depth = round_up(buffering_depth, block_depth)
            buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
            if buffering_bytes > half_buffer_limit:
                buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

            # Re-encode with progressively smaller slices until the largest slice fits
            # in half the buffer limit, or the prebuffer depth reaches OFMSplitDepth
            while True:
                # Attempt to buffer whole blocks
                if buffering_depth > block_depth:
                    buffering_depth = round_down(buffering_depth, block_depth)
                else:
                    buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )
                assert encoded_weights is not None
                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.max_range_bytes <= half_buffer_limit
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                # Halve whichever of the two depths is currently the larger one
                if buffering_depth > prebuffer_depth:
                    buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                else:
                    prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_buffer_size = weight_buffer_size * 2
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensor = self.buffer_tensor(
            encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name
        )
        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
            cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

        # The buffer consumes part of the memory slack available to this operator
        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

782

Jacob Bohlin

2021-08-17 17:44:45 +0200

[diff] [blame]

783

def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
    """Create the fast-storage tensor that buffers `src_tensor`.

    The returned tensor has shape [1, 1, 1, buffer_size], data type uint8 and
    is named `name` + "_buffer". It references `src_tensor` as its source, is
    placed in the architecture's fast storage area with mem type Scratch_fast,
    has purpose Weights and the supplied `sub_purpose`.
    """
    buf = Tensor([1, 1, 1, buffer_size], DataType.uint8, name + "_buffer")
    buf.src_tensor = src_tensor

    # Placement of the buffer
    buf.mem_area = self.arch.fast_storage_mem_area
    buf.mem_type = MemType.Scratch_fast

    # Role of the buffer
    buf.purpose = TensorPurpose.Weights
    buf.sub_purpose = sub_purpose
    return buf

791

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

792

def propose_minimal_schedule(self) -> Schedule:
    """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
    the next operator's stride.

    The operators are visited in reverse order. The stripe height of an operator is the vertical kernel stride
    of the operator visited just before it (its consumer), or 1 for the first operator visited. Returns a new
    Schedule named "MIN" holding one cost entry per operator.
    """
    min_schedule = Schedule(self.sg, "MIN")
    costs = min_schedule.cost_map

    # The Op visited in the previous iteration - it consumes the current Op's OFM
    consumer: Optional[SchedulerOperation] = None
    for sched_op in reversed(self.sched_ops):
        if consumer:
            stripe_height = consumer.kernel.stride.y
        else:
            stripe_height = 1
        stripe = sched_op.ofm.shape.with_height(stripe_height)

        op_cost = sched_op.create_scheduler_info(self.nng, stripe)
        # Performance is estimated for the full OFM depth
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        costs[sched_op] = op_cost

        consumer = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down.

    The operators are walked in reverse order starting with `final_stripe`; operators without a cost entry in
    `ref_schedule` are skipped. Where the reference schedule buffers an operator's weights, the proposal gets
    a single (Standard) buffer covering the complete newly encoded weights. Returns a new Schedule named
    `label`.
    """
    reference_costs = ref_schedule.cost_map
    striped_schedule = Schedule(self.sg, label)

    current_stripe = final_stripe
    for sched_op in reversed(self.sched_ops):
        if sched_op not in reference_costs:
            # sched_op is not part of the sub-schedule - skip
            continue

        # Create a cost entry with the new stripe
        op_cost = sched_op.create_scheduler_info(self.nng, current_stripe)

        if reference_costs[sched_op].buffered_weight_tensor:
            # If the weights are buffered in the reference schedule they should be in the new proposal
            weights = op_cost.npu_weights_tensor
            op_cost.buffered_weight_tensor = self.buffer_tensor(
                weights, TensorSubPurpose.Standard, len(weights.buffer), weights.name
            )

        # Estimate performance
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        striped_schedule.cost_map[sched_op] = op_cost

        # Calculate the preceding Op's stripe
        ifm_stripe_height = current_stripe.height * sched_op.kernel.stride.y
        current_stripe = sched_op.ifm.shape.with_height(ifm_stripe_height)

    return striped_schedule

841

842

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Estimates the peak memory usage of a schedule.

    Operators without a cost entry in `schedule` are ignored. An operator that belongs to a cascade contributes
    the cascade's recorded mem_usage. Any other operator contributes the sum of its IFM size, OFM size, the
    storage size of its buffered weight tensor (if any) and its entry in `non_local_mem_usage` (0 if absent).
    Returns the largest contribution found, or 0 when there is none.
    """
    op_costs = schedule.cost_map
    cascades = schedule.cascades

    peak = 0
    for sched_op in self.sched_ops:
        if sched_op not in op_costs:
            # sched_op is not part of the sub-schedule - skip
            continue

        cascade_id = op_costs[sched_op].cascade
        if cascade_id:
            # Part of a cascade - non-local memory usage is already included in the cascade's figure
            peak = max(cascades[op_costs[sched_op].cascade].mem_usage, peak)
            continue

        # Stand-alone Op - add up everything that is live while it runs
        weight_buffer = op_costs[sched_op].buffered_weight_tensor
        weight_bytes = weight_buffer.storage_size() if weight_buffer else 0

        usage = sched_op.ifm_size_in_bytes()
        usage = usage + sched_op.ofm_size_in_bytes()
        usage = usage + weight_bytes
        usage = usage + non_local_mem_usage.get(sched_op, 0)
        peak = max(usage, peak)

    return peak

872

873

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
    proposing weight buffering and then continuously proposing new stripe sizes.

    The stripe heights of the final Op (1 up to half of its OFM height) are searched in a binary-search like
    fashion; a proposal is accepted when its estimated memory usage is within `memory_limit`. Returns the last
    accepted proposal, or None when no proposal was accepted (or there was nothing to search).
    """
    ref_costs = ref_schedule.cost_map

    # Extract the ops that are part of this sub-schedule
    first = cascade_info.start
    last = cascade_info.end
    ops = self.sched_ops[first : last + 1]

    # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
    sub_schedule = Schedule(self.sg, f"SUB_{first}_{last}")
    for sched_op in ops:
        sub_schedule.cost_map[sched_op] = ref_costs[sched_op]
    sub_schedule.cascades[last] = cascade_info
    # Use the memory snapshot from the reference schedule
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    # Calculate memory usage that is live during the sub-schedule but not part of it
    head_op = ops[0]
    cascade_time = ref_costs[head_op].time_index
    parallel_usage = ref_schedule.memory_snapshot[cascade_time] - cascade_info.mem_usage

    # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
    # included in a cascade or not
    if len(head_op.ifm.connection.consumers) > 1:
        shared_ifm_bytes = head_op.ifm_size_in_bytes()
    else:
        shared_ifm_bytes = 0

    # Calculate non-local-mem-usage per Operator; the shared IFM counts for every Op but the first
    non_local_mem_usage = {}
    for position, sched_op in enumerate(ops):
        non_local_mem_usage[sched_op] = parallel_usage
        if position != 0:
            non_local_mem_usage[sched_op] += shared_ifm_bytes

    cascade_builder = CascadeBuilder(ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Start by adding buffering
    buffered_sub_schedule = self.propose_schedule_buffering(
        sub_schedule, self.scheduler_options.optimization_sram_limit
    )
    # Copy the cascades over from the unbuffered-schedule
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Generate the possible stripings for the final Op in the sub-schedule
    final_ofm_shape = ops[-1].ofm.shape
    candidates = [final_ofm_shape.with_height(height) for height in range(1, final_ofm_shape.height // 2 + 1)]

    # Propose different striping - the possible stripes are proposed similarly to a binary search
    best_schedule = None
    iteration = 0
    while len(candidates) > 1:
        middle = len(candidates) // 2
        proposal = self.propose_schedule_striping(
            candidates[middle], f"OPTIMIZED_{iteration}", buffered_sub_schedule
        )

        cascade_builder.build_cascades(proposal, max_template, memory_limit)

        # Check if proposal fits
        proposal_usage = self.estimate_schedule_memory_usage(proposal, non_local_mem_usage)
        if proposal_usage <= memory_limit:
            # Remove all possible stripes smaller than this
            candidates = candidates[middle:]
            best_schedule = proposal
            if not proposal.cascades:
                # No cascading required - early exit
                break
        else:
            # Proposal doesn't fit within the limit - remove all possible stripes larger than this
            candidates = candidates[:middle]

        iteration += 1

    return best_schedule

def optimize_schedule(
    self,
    schedule: Schedule,
    max_sched: Schedule,
    max_template: Schedule,
    options: SchedulerOptions,
) -> Schedule:
    """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule.

    When `max_sched` already fits below the SRAM limit from `options` and spilling is not enabled, `max_sched`
    is returned unchanged. Otherwise every cascade of `schedule` is optimized via optimize_sub_schedule() and
    merged back into `schedule` (which is modified in place and stored as self.sg.schedule), after which
    weight buffering is proposed for the result. Returns the buffered schedule.
    """
    sram_limit = options.optimization_sram_limit

    max_sched_fits = max_sched.fast_storage_peak_usage < sram_limit
    if max_sched_fits and not self.arch.is_spilling_enabled():
        # Maximum performance schedule fits within the SRAM target
        return max_sched

    # Iterate over a copy of the cascades since they may change during the loop
    for cascade_info in tuple(schedule.cascades.values()):
        # Optimize the sub-schedule in this cascade
        optimized_sub = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        if not optimized_sub:
            continue

        # Remove the existing cascade
        del schedule.cascades[cascade_info.end]
        # Update the sub-schedule Op and cascade costs to the full schedule
        schedule.cost_map.update(optimized_sub.cost_map)
        schedule.cascades.update(optimized_sub.cascades)

    # Update memory snapshot
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Propose schedule buffering to the optimized schedule
    buffering_limit = self.scheduler_options.optimization_sram_limit
    optimized_sched = self.propose_schedule_buffering(schedule, buffering_limit)
    # Copy the cascade's metadata from the unbuffered schedule
    optimized_sched.cascades = schedule.cascades
    return optimized_sched

982

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame^]

983

def optimize_weight_buffering_size(
    self,
    min_schedule: Schedule,
    options: SchedulerOptions,
):
    """Tries a schedule with reduced weight buffers and keeps it only if it performs better.

    The performance of the current schedule (self.sg.schedule) is measured first. Every operator whose weight
    buffer live range overlaps an evicted feature map (self.evicted_fms) gets the size of that feature map
    added to its evicted_fms_size. Weight buffering is then proposed again starting from `min_schedule`, the
    result is applied and the performance is measured again.

    The new schedule is kept only when both the total cycles and the DRAM access cycles improved (after
    rounding the relative improvement to two decimals). Otherwise evicted_fms_size is reset to 0 on every
    operator and the original schedule is restored and re-applied. A baseline of zero cycles counts as
    "no improvement" instead of causing a division by zero.

    Nothing is returned; the outcome is left in self.sg.schedule.
    """

    def relative_improvement(default_cycles, new_cycles):
        # A zero baseline cannot be improved upon; also avoids dividing by zero
        if default_cycles == 0:
            return 0
        return round((default_cycles - new_cycles) / default_cycles, 2)

    default_schedule = self.sg.schedule
    npu_performance.calc_new_performance_for_network(self.nng, self.arch)
    default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
    default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

    # Restore mem/type for scratched_fms
    for tens in self.scratched_fms:
        tens.mem_area = self.scratched_fms[tens][0]
        tens.mem_type = self.scratched_fms[tens][1]

    self.update_op_memory_snapshot(self.sg.schedule)

    # Collect live ranges from tensors
    memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(),
            mem_area,
            mem_type_set,
            lr_graph,
            Tensor.AllocationQuantum,
        )

    # Find the relation between the sched_op and the buffering tensor
    weight_ops = {}
    for sched_op in self.sched_ops:
        cost = self.sg.schedule.cost_map[sched_op]
        if cost.buffered_weight_tensor:
            weight_ops[cost.buffered_weight_tensor] = sched_op

    # Filter out weight buffer live ranges
    weight_lrs = []
    for lr in lr_graph.lrs:
        for tens in lr.tensors:
            if weight_ops.get(tens):
                weight_lrs.append(lr)
                break

    # See if any evicted fm overlaps with a weight buffering op.
    # If this is the case add a size limitation to the buffering op
    for lr in self.evicted_fms:
        for weight_lr in weight_lrs:
            if lr.overlaps_ranges(weight_lr):
                for tens in weight_lr.tensors:
                    sched_op = weight_ops.get(tens)
                    if sched_op:
                        # Add size reduction to the op
                        sched_op.evicted_fms_size += lr.size
                        break

    self.sg.schedule = min_schedule
    self.update_op_memory_snapshot(self.sg.schedule)

    # Run schedule buffering - with weight buffer size reduction
    schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
    schedule.cascades = self.sg.schedule.cascades
    self.sg.schedule = schedule

    # Apply new buffer schedule and calc new performance
    self.update_op_memory_snapshot(self.sg.schedule)
    self.apply_schedule(self.sg.schedule)
    self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

    npu_performance.calc_new_performance_for_network(self.nng, self.arch)
    new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
    new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

    improvement_tot = relative_improvement(default_tot_cycles, new_tot_cycles)
    improvement_dram = relative_improvement(default_dram_cycles, new_dram_cycles)

    # Compare both total and dram improvement
    if not (improvement_tot > 0 and improvement_dram > 0):
        # No improvement, restore the default schedule
        for sched_op in self.sched_ops:
            sched_op.evicted_fms_size = 0

        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.sg.schedule = default_schedule
        self.update_op_memory_snapshot(self.sg.schedule)
        self.apply_schedule(self.sg.schedule)
        self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

1073

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1074

def apply_schedule(self, sched: Schedule):
    """Applies the given schedule as a final solution.

    For every operator:
      * if its cascade has a buffer registered for it, the IFM's parent tensor is moved to fast storage
        (Scratch_fast), set to NHCWB16 format and turned into a rolling buffer in Y with the buffer's height,
      * the block config of the schedule is written to the operator's pass,
      * the buffered weight tensor (if any) is pointed at the encoded NPU weights tensor.
    """
    for sched_op in self.sched_ops:
        op_info = sched.cost_map[sched_op]

        cascade_info = sched.cascades.get(op_info.cascade, None)
        has_cascade_buffer = cascade_info and sched_op in cascade_info.buffers
        if has_cascade_buffer:
            rolling_tens = sched_op.ifm.connection.parent_tens
            # Apply memory area and type
            rolling_tens.mem_area = self.arch.fast_storage_mem_area
            rolling_tens.mem_type = MemType.Scratch_fast
            # Apply Rolling buffer
            rolling_tens.set_format(TensorFormat.NHCWB16, self.arch)
            buffer_height = cascade_info.buffers[sched_op].height
            rolling_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, buffer_height)

        sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

        # Ensure that the src_tensor reference is set correctly
        if op_info.buffered_weight_tensor:
            op_info.buffered_weight_tensor.src_tensor = op_info.npu_weights_tensor

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1093

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1094

def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
    """Places OFMs in fast storage and sorts out those that do not fit within `staging_limit`.

    The OFM of every non-cascaded operator that has dependants, and whose OFM tensor has no None entry in its
    consumer list, is first moved to fast storage (its original mem area/type is remembered once in
    self.scratched_fms). If the resulting temporal memory usage never exceeds `staging_limit` nothing more is
    done. Otherwise each affected live range is either evicted (also recorded in self.evicted_fms), kept, or
    handed to a FastStorageComponentAllocator together with the other live ranges it competes with.

    self.evicted_fms is reset to an empty list on every call.
    """
    max_mem_usage = []
    base_mem_usage = []
    fast_storage_type = MemType.Scratch_fast
    fast_storage_mem_area = self.arch.fast_storage_mem_area
    self.evicted_fms = []

    # Force all OFMs to fast-storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]
        if cost.cascade == 0 and sched_op.get_dependants():
            ofm_tens = sched_op.ofm.connection.parent_tens
            if not any(cons is None for cons in ofm_tens.consumer_list):
                if ofm_tens not in self.scratched_fms:
                    # Remember default mem area and mem type, only done once
                    self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)

                ofm_tens.mem_area = fast_storage_mem_area
                ofm_tens.mem_type = fast_storage_type

    # Collect live ranges from tensors
    memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(),
            mem_area,
            mem_type_set,
            lr_graph,
            Tensor.AllocationQuantum,
        )
    # Memory usage over time with all the OFMs above placed in fast storage
    max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

    # If true, everything fits and we can proceed
    if max(max_mem_usage) <= staging_limit:
        return

    # Build up the base memory usage by removing the
    # mem_usage of the lrs we previously moved to fast-storage
    base_mem_usage = np.array(max_mem_usage)
    curr_lrs = []
    for lr in lr_graph.lrs:
        for tens in lr.tensors:
            if self.scratched_fms.get(tens):
                curr_lrs.append(lr)
                base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
                break
    competing_lrs = []
    for lr in curr_lrs:
        base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
        # If true, the lr will never fit and may thus be evicted
        if base_usage + lr.size > staging_limit:
            self.evicted_fms.append(lr)
            # NOTE(review): evict() presumably restores the tensors' original memory placement from
            # scratched_fms and updates max_mem_usage - confirm against FastStorageComponentAllocator
            FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
            continue
        # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
        # the memory limit cannot be exceeded if max_mem_usage does not.
        # Thus, the affected lrs can remain in fast-storage if the following is true
        if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
            FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
        else:
            competing_lrs.append(lr)
    sz = len(competing_lrs)
    # All lrs and their tensors have been handled if sz is zero, we may thus return
    if sz == 0:
        return

    competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
    # [start, i) is the index range of the component currently being collected
    start = 0
    start_time = competing_lrs[0].start_time
    end_time = competing_lrs[0].end_time
    component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
    # Build up components and then allocate each separately
    for i, lr in enumerate(competing_lrs):
        # Extend the current component while the lr starts before the component ends and the
        # component is still smaller than the allocator's max_exhaustive_size
        if lr.start_time <= end_time and i - start < component_allocator.max_exhaustive_size:
            start_time = min(start_time, lr.start_time)
            end_time = max(end_time, lr.end_time)
        else:
            component_allocator.allocate_component(
                component_allocator,
                competing_lrs[start:i],
                max_mem_usage,
                base_mem_usage,
                staging_limit,
                self.scratched_fms,
            )
            # Start a new component with the current lr
            start = i
            start_time = lr.start_time
            end_time = lr.end_time
    # Allocate the final component
    component_allocator.allocate_component(
        component_allocator,
        competing_lrs[start:sz],
        max_mem_usage,
        base_mem_usage,
        staging_limit,
        self.scratched_fms,
    )

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1191

1192

def move_constant_data(self):
    """Determine if data can be moved from permanent storage to another memory area. A move
    will generate a DMA command in the high-level command stream.

    Every input tensor of every operator is inspected. A tensor outside scratch memory is replaced by a clone
    in fast storage (LUTs are placed in Shram) when it is a LUT, or when it is a broadcast feature map of a
    binary elementwise operator that is larger than the available IFM SHRAM and is not consumed by vector
    product operators only. The clone replaces the tensor in the operator's inputs and, when the index exists,
    in the inputs of the operator's pass.
    """
    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        usable_banks = self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = usable_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
                # Not in permanent storage - nothing to do
                continue

            # Tensor is in permanent storage
            # Only when permanent storage differs from feature map storage, there is a point moving the data
            if not (
                (
                    tens.mem_area in self.arch.permanent_storage_mem_area
                    and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                )
                or tens.purpose == TensorPurpose.LUT
            ):
                continue

            if not (
                tens.purpose == TensorPurpose.LUT
                or (
                    # For elementwise broadcast
                    tens.purpose == TensorPurpose.FeatureMap
                    and sched_op.op_type.is_binary_elementwise_op()
                    and tens.shape != []
                    and sched_op.ifm.shape != sched_op.ofm.shape
                    and parent_op.write_shape is None
                    and tens.storage_size() > max_ifm_shram_avail
                )
            ):
                continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            if only_vector_product_consumers and not (tens.purpose == TensorPurpose.LUT):
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if tens.purpose == TensorPurpose.LUT:
                new_tens.mem_area = MemArea.Shram

            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            # If the index is out of range, IFM and IFM2 are the same tensor
            # and pass inputs don't have duplicates
            pass_inputs = sched_op.parent_ps.inputs
            if idx < len(pass_inputs):
                pass_inputs[idx] = new_tens

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1237

1238

def print_schedule(self, schedule: Schedule):
    """Print a human-readable dump of the given schedule to stdout.

    The output consists of the schedule name, one entry for every operation in
    self.sched_ops that has an entry in schedule.cost_map (index, name, OFM shape,
    operator type, kernel, the cost entry itself and the SRAM usage recorded for
    the operation's time index) and finally one line per cascade in
    schedule.cascades.
    """
    print(f"Schedule: '{schedule.name}'")

    # Operations without an entry in the cost map are not part of this printout
    costed_ops = (op for op in self.sched_ops if op in schedule.cost_map)
    for op in costed_ops:
        cost = schedule.cost_map[op]
        print(f"\t{op.index}: Operation {op.name} - OFM {op.ofm.shape}")
        print(f"\t\tType: {op.op_type}")
        print(f"\t\tKernel: {op.kernel}")
        print(f"{cost}")

        # Report 0 when the memory snapshot has no entry for this time index
        if cost.time_index < len(schedule.memory_snapshot):
            sram_used = schedule.memory_snapshot[cost.time_index]
        else:
            sram_used = 0
        print(f"\t\tSRAM Used: {sram_used} bytes")

    print("\tCascades:")
    for number, casc in enumerate(schedule.cascades.values()):
        print(f"\t\t{number}: {casc.start} -> {casc.end}, size: {casc.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

1260

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1261

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1262

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph for the scratch memory types.

    When spilling is enabled, Scratch_fast tensors are allocated in the fast storage
    memory area first, followed by Scratch tensors in the feature map storage memory
    area. Otherwise both memory types are allocated together in the feature map
    storage memory area. The allocator settings (tensor_allocator, verbose_allocation
    and cpu_tensor_alignment) are read from options.

    This function has no return statement; all results are side effects of
    tensor_allocation.allocate_tensors.
    """
    root_subgraph = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important: fast storage is allocated before feature map storage
        allocation_passes = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        allocation_passes = [
            (arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast}),
        ]

    for target_area, mem_types in allocation_passes:
        tensor_allocation.allocate_tensors(
            nng,
            root_subgraph,
            arch,
            target_area,
            mem_types,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )

1292

1293

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1294

class FastStorageComponentAllocator:
    """Exhaustively decides which live ranges of a component are kept or evicted.

    The search tries, for every live range (lr) in self.lrs, to either keep it
    (its size is added to base_mem_usage over its time range) or evict it (its size
    is removed from max_mem_usage over its time range), and remembers the
    combination that keeps the largest total size within staging_limit.

    Live ranges are expected to expose start_time, end_time (inclusive), size and
    tensors. Presumably base_mem_usage is the usage with every candidate evicted and
    max_mem_usage the usage with every candidate kept - confirm against the caller.
    """

    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        # Per-timestep usage; held by reference and modified in place during the search
        self.base_mem_usage = base_mem_usage
        # Per-timestep usage; private copy so the caller's sequence is not modified by the search
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        # Best eviction decision found so far, one entry per lr
        self.evicted = []
        # Eviction decision of the branch currently being explored
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_allocated_size = 0
        self.max_exhaustive_size = 20

    def allocate_exhaustive(self, ix, alloc_size):
        """Recursively explore keep/evict decisions for self.lrs[ix:].

        ix is the index of the lr to decide on, alloc_size the total size of the lrs
        kept so far on this branch. When all lrs have been decided, the branch is
        stored in self.evicted / self.best_allocated_size if it keeps more than the
        best branch seen so far. base_mem_usage and max_mem_usage are restored
        before returning.
        """
        if ix >= len(self.lrs):
            if alloc_size > self.best_allocated_size:
                self.best_allocated_size = alloc_size
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        # The live range is inclusive of end_time, so the invariant must hold there too
        for t in range(lr.start_time, lr.end_time + 1):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]

        # The lr can be kept if adding it to the base usage stays within the limit on its whole time range
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        can_fit = base_usage + lr.size <= self.staging_limit

        # If even the max usage is within the limit, evicting the lr never needs to be explored
        always_fits = can_fit
        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

        # always_fits implies can_fit, so testing can_fit alone is sufficient here
        if can_fit:
            # Explore the branch where the lr is kept
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            self.allocate_exhaustive(ix + 1, alloc_size + lr.size)
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            # Explore the branch where the lr is evicted
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, alloc_size)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        """Add (increase=True) or subtract lr.size in place over the lr's time range.

        Returns the same mem_usage object that was passed in.
        """
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += lr.size if increase else -lr.size
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        """Remove lr from max_mem_usage and restore its tensors' memory placement.

        For every tensor of the lr found in scratched_fms, mem_area and mem_type are
        set from elements 0 and 1 of the stored entry.
        """
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area = scratched_fms[tens][0]
                tens.mem_type = scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        """Add lr to base_mem_usage over its time range, asserting the staging limit holds."""
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms):
        """Run the exhaustive search on lrs using allocator and apply the result.

        Evicted lrs are removed from max_mem (and their tensors restored through
        scratched_fms), kept lrs are added to min_mem.
        """
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * sz
        allocator.curr_evicted = [0] * sz
        # -1 so that a branch keeping nothing (size 0) is still recorded as a valid result
        allocator.best_allocated_size = -1
        # Recursively evaluate all permutations of allocations of the lrs found in the component
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for i, e in enumerate(allocator.evicted):
            if e:
                self.evict(lrs[i], max_mem, scratched_fms)
            else:
                self.keep(lrs[i], min_mem, staging_limit)

1373

1374

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1375

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1376

"""Entry point for the Scheduler"""

1377

# Initialize CPU subgraphs

1378

schedulers = dict()

1379

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1380

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1381

if sg.placement != PassPlacement.Npu:

1382

# Create cascaded passes for CPU Ops

1383

cascaded_passes = []

1384

for idx, ps in enumerate(sg.passes):

1385

cps = CascadedPass(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

1386

ps.name,

1387

SchedulingStrategy.WeightStream,

ps.inputs,

[],

ps.outputs,

[ps],

ps.placement,

False,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1394

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1395

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1396

cps.time = idx

1397

ps.cascade = cps

1398

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1399

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1400

sg.cascaded_passes = cascaded_passes

1401

else:

1402

# Npu subgraph - create schedule

1403

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1404

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1405

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1406

scheduler.create_scheduler_representation(arch)

1407

sg.sched_ops = scheduler.sched_ops

1408

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1409

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1410

# Create the Max schedule template

1411

max_schedule_template = scheduler.create_initial_schedule()

1412

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1413

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1414

# Create the optimised Max schedule

1415

sg.schedule = max_schedule_template

1416

scheduler.update_op_memory_snapshot(max_schedule_template)

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

1417

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1418

sg.schedule = opt_max_schedule

1419

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1420

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1421

# Create Min schedule

1422

min_schedule = scheduler.propose_minimal_schedule()

1423

initial_sram_limit = scheduler_options.optimization_sram_limit

1424

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1425

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1426

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1427

cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())

1428

cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)

1429

sg.schedule = min_schedule

1430

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1431

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1432

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1433

# Create an optimized schedule

1434

sg.schedule = scheduler.optimize_schedule(

1435

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1436

)

1437

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1438

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1439

scheduler.apply_schedule(sg.schedule)

1440

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1441

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame^]

1442

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:

1443

# It might be possible to gain performance by reducing

1444

# weight buffer size and instead fit fms in fast storage

1445

scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)

1446

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1447

if scheduler_options.verbose_schedule:

1448

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1449

Tim Hall