# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


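# Illustrative sketch (hypothetical values, not part of the original source): the NHCWB16
# brick format stores the depth axis in bricks of 16, so storage-size helpers round the
# depth up. For example:
#   shape_for_format(Shape4D(1, 8, 8, 20), TensorFormat.NHCWB16) -> Shape4D(1, 8, 8, 32)
#   shape_for_format(Shape4D(1, 8, 8, 20), TensorFormat.NHWC)    -> Shape4D(1, 8, 8, 20)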
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

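    # Illustrative note (hypothetical values, not part of the original source): for an OFM
    # stripe of height 1 and width 1 with a 3x3 kernel, stride 1 and no upscaling,
    # get_ifm_area_required typically reports an input area of roughly the kernel footprint
    # (3x3), so the minimum-stripe input below is driven by the kernel dimensions and the
    # resampling mode.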
    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

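    # Illustrative example (hypothetical, not part of the original source): an elementwise
    # Add with OFM depth 20 (not a multiple of 16) whose IFM is already NHWC and has the
    # same shape, format and dtype as the OFM can be overwritten by its output; in that
    # case avoid_nhcwb16_for_ofm() below returns True and the OFM is kept linear (NHWC)
    # instead of being converted to NHCWB16.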
    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        """For elementwise ops, when the IFM is in NHWC format and its depth is not brick-format
        aligned (a multiple of 16), it is better to enforce NHWC format on the OFM if the OFM can
        overwrite the IFM, in order to reduce memory transactions"""

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

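    # Worked example for the slack calculation below (hypothetical numbers, not part of the
    # original source): if the staging limit is 1000 KB and the reference schedule already
    # uses 700 KB at this Op's time index, slack_buffering_memory becomes 300 KB. If
    # use_fast_storage_for_feature_maps has evicted 100 KB of feature maps for this Op,
    # 300 / 100 = 3.0 >= 1.5, so the weight buffer limit is reduced to 200 KB before weight
    # buffering is proposed.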
    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are fms that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

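    # Worked example for the prebuffering maths below (hypothetical numbers, not part of the
    # original source): with full_weights_bytes = 64 KB, slack_cycles = 1000 and a full
    # transfer estimated at 4000 cycles, prebuffer_ratio = 0.25, so roughly 16 KB of weights
    # (clamped to half the buffer limit and rounded to OFMSplitDepth granularity) are fetched
    # under the previous operator; the remaining depth slices are streamed in while this
    # operator runs.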
    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator, and how much SRAM is unused, for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)

                # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two
                # slices (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two
                # buffers were used.
                last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

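    # Illustrative note (hypothetical, not part of the original source): in the minimal
    # schedule every OFM stripe is as short as its consumer allows. If the consuming
    # operator has a kernel stride of 2 in y, min_stripe_height below becomes 2, so this
    # operator is proposed to produce its OFM two rows at a time; the last operator in the
    # execution order has no consumer here and gets a stripe height of 1.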
    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

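    # Worked example for the striping proposal below (hypothetical numbers, not part of the
    # original source): if the current Op's OFM stripe has height 8 and the Op's kernel has
    # stride 2 in y, the preceding Op is asked to produce a stripe of height 8 * 2 = 16;
    # when an upscaling or nearest-neighbour Op is part of the cascade the height is first
    # adjusted so that it stays even / aligned with the upscaling factor.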
    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

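    # Illustrative note (hypothetical numbers, not part of the original source): "non-local"
    # memory usage is everything alive at an Op's time index that is not the Op's own
    # IFM/OFM. For example, if the MIN-schedule snapshot shows 500 KB at the Op's time index
    # and the Op's own IFM + OFM account for 300 KB, then 200 KB is non-local and must stay
    # available when building cascades around that Op.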
    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has the ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFM's that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

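    # Illustrative note (hypothetical numbers, not part of the original source): for a
    # cascade whose final OFM is 64 rows high, the loop below tries stripe heights from just
    # above the height already used in the reference (MIN) schedule up to 32 (half the OFM
    # height), keeping the last proposal that still fits the memory limit without splitting
    # the cascade into more cascades than the first proposal produced.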
    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/IFM dram bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

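    # Illustrative note (hypothetical numbers, not part of the original source): the weight
    # buffer size reduction below is only kept if it does not hurt performance. For example,
    # if the default schedule is estimated at 1,000,000 total cycles with 400,000
    # DRAM-access cycles and the reduced-buffer schedule at 990,000 / 380,000, then
    # improvement_tot = 0.01 and improvement_dram = 0.05, so the new schedule is kept;
    # otherwise the default schedule and the original feature-map placements are restored.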
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001109 def optimize_weight_buffering_size(
1110 self,
1111 min_schedule: Schedule,
1112 options: SchedulerOptions,
1113 ):
1114 default_schedule = self.sg.schedule
Tim Hallc1be0872022-03-03 17:50:52 +00001115 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001116 default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1117 default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1118
1119 # Restore mem/type for scratched_fms
1120 for tens in self.scratched_fms:
1121 tens.mem_area = self.scratched_fms[tens][0]
1122 tens.mem_type = self.scratched_fms[tens][1]
1123
1124 self.update_op_memory_snapshot(self.sg.schedule)
1125
1126 # Collect live ranges from tensors
1127 memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
1128 lr_graph = live_range.LiveRangeGraph()
1129 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001130 live_range.extract_live_ranges_from_schedule(
1131 self.sg,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001132 mem_area,
1133 mem_type_set,
1134 lr_graph,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001135 )
1136
1137 # Find the relation between the sched_op and the buffering tensor
1138 weight_ops = {}
1139 for sched_op in self.sched_ops:
1140 cost = self.sg.schedule.cost_map[sched_op]
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001141 for tens in cost.buffered_weight_tensors:
1142 weight_ops[tens] = sched_op
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001143
1144 # Filter out weight buffer live ranges
1145 weight_lrs = []
1146 for lr in lr_graph.lrs:
1147 for tens in lr.tensors:
1148 if weight_ops.get(tens):
1149 weight_lrs.append(lr)
1150 break
1151
1152 # See if any evicted fm overlaps with a weight buffering op.
1153 # If this is the case add a size limitation to the buffering op
1154 for lr in self.evicted_fms:
1155 for weight_lr in weight_lrs:
1156 if lr.overlaps_ranges(weight_lr):
1157 for tens in weight_lr.tensors:
1158 sched_op = weight_ops.get(tens)
1159 if sched_op:
1160 # Add size reduction to the op
1161 sched_op.evicted_fms_size += lr.size
1162 break
1163
1164 self.sg.schedule = min_schedule
1165 self.update_op_memory_snapshot(self.sg.schedule)
1166
1167 # Run schedule buffering - with weight buffer size reduction
1168 schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
1169 schedule.cascades = self.sg.schedule.cascades
1170 self.sg.schedule = schedule
1171
1172 # Apply new buffer schdule and calc new performance
1173 self.update_op_memory_snapshot(self.sg.schedule)
1174 self.apply_schedule(self.sg.schedule)
1175 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1176
Tim Hallc1be0872022-03-03 17:50:52 +00001177 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001178 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1179 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1180
Tim Hall8bc7a652022-05-19 15:29:23 +01001181 improvement_tot = (
1182 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1183 )
1184 improvement_dram = (
1185 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1186 )
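        # For example (hypothetical numbers): with default_tot_cycles = 1_000_000 and
        # new_tot_cycles = 950_000, improvement_tot = round((1_000_000 - 950_000) / 1_000_000, 2) = 0.05;
        # a negative value means the new schedule is slower than the default one.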
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001187
1188 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001189 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001190 # No improvement, restore the default schedule
1191 for sched_op in self.sched_ops:
1192 sched_op.evicted_fms_size = 0
1193
1194 for tens in self.scratched_fms:
1195 tens.mem_area = self.scratched_fms[tens][0]
1196 tens.mem_type = self.scratched_fms[tens][1]
1197
1198 self.sg.schedule = default_schedule
1199 self.update_op_memory_snapshot(self.sg.schedule)
1200 self.apply_schedule(self.sg.schedule)
1201 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1202
Tim Halld8339a72021-05-27 18:49:40 +01001203 def apply_schedule(self, sched: Schedule):
1204 """Applies the given schedule as a final solution"""
1205 for sched_op in self.sched_ops:
1206 op_info = sched.cost_map[sched_op]
1207 cascade_info = sched.cascades.get(op_info.cascade, None)
1208 if cascade_info and sched_op in cascade_info.buffers:
1209 buffer_tens = sched_op.ifm.connection.parent_tens
1210 # Apply memory area and type
1211 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1212 buffer_tens.mem_type = MemType.Scratch_fast
1213 # Apply Rolling buffer
1214 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1215 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
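                # RollingBufferY: only a window of `height` rows of the cascade's intermediate
                # tensor is kept resident at a time, which is what lets the cascade's buffer
                # fit in fast storage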
1216
1217 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1218
1219 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001220 for tens in op_info.buffered_weight_tensors:
1221 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001222
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001223 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001224 """Finds the set of feature maps that fits within the staging limit and has the largest combined number of
1225 access cycles, then moves those feature maps into fast storage"""
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001226 max_mem_usage = []
1227 base_mem_usage = []
1228 fast_storage_type = MemType.Scratch_fast
1229 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001230 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001231
1232 # Force all OFMs to fast-storage
1233 for sched_op in self.sched_ops:
1234 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001235 if cost.cascade == 0 and sched_op.get_dependants():
1236 ofm_tens = sched_op.ofm.connection.parent_tens
1237 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001238 if ofm_tens not in self.scratched_fms:
1239 # Remember default mem area and mem type, only done once
1240 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1241
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001242 ofm_tens.mem_area = fast_storage_mem_area
1243 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001244
1245 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001246 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001247 lr_graph = live_range.LiveRangeGraph()
1248 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001249 live_range.extract_live_ranges_from_schedule(
1250 self.sg,
Jonas Ohlssond8575072022-03-30 10:30:25 +02001251 mem_area,
1252 mem_type_set,
1253 lr_graph,
Tim Halld8339a72021-05-27 18:49:40 +01001254 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001255 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001256
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001257 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001258 if max(max_mem_usage) <= staging_limit:
1259 return
1260
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001261 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001262 base_mem_usage = np.array(max_mem_usage)
1263 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001264 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001266 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001267 curr_lrs.append(lr)
1268 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1269 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001271 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001272
1273 # Evict live ranges that will never fit
1274 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001275 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001276 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001277 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001278 self.evicted_fms.append(lr)
1279 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001280 curr_lrs.remove(lr)
1281
1282 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1283 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001284 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1285 # the memory limit cannot be exceeded if max_mem_usage does not exceed it.
1286 # Thus, the affected lrs can remain in fast-storage if the following is true
1287 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001288 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001289 else:
1290 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001291 for tens in lr.tensors:
1292 competing_tens_access[tens] = 0
1293
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001294 # If competing_lrs is empty, all lrs and their tensors have been handled and we may return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001295 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001296 return
1297
Johan Alfvén5c309712022-06-10 15:40:58 +02001298 # Estimate element access for all tensors that are competing for a place in fast-storage.
1299 # This number is used when deciding which tensors stay in fast-storage
1300 for sched_op in self.sched_ops:
1301 cost = schedule.cost_map[sched_op]
1302
1303 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1304 tens = sched_op.ifm.connection.parent_tens
1305 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1306 competing_tens_access[tens] += access.ifm_read[0]
1307
1308 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1309 tens = sched_op.ifm2.connection.parent_tens
1310 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1311 competing_tens_access[tens] += access.ifm_read[1]
1312
1313 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1314 tens = sched_op.ofm.connection.parent_tens
1315 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1316 competing_tens_access[tens] += access.ofm_write
1317
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001318 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001319 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001320
1321 # Remove lrs that have a live range that is too long compared to others.
1322 # They are causing problems for the HillClimb Allocator when it has to
1323 # change the allocation indices, in order to fit all the allocations into SRAM.
1324 # This problem only occurs in larger networks with complex graphs.
1325 #
1326 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1327 # at a time. Too many items result in excessively long compilation times.
1328 #
1329 # Too long is currently decided to be (based on experience, analyzing many networks):
1330 # Compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1331 # If the end times differ by more than MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
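        # For example (hypothetical end times, using the limits of 20 defined on
        # FastStorageComponentAllocator): if an lr spans at least 20 time steps, ends at time 100,
        # and the lr 20 positions later in the sorted list ends at time 60, then 100 > 60 + 20 and
        # the lr is removed from the fast storage candidates before the component split below.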
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001332 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1333 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001334 competing_lrs_copy = competing_lrs.copy()
1335 for i, lr in enumerate(competing_lrs_copy):
1336 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001337 # Only check live ranges longer than MAX_EXHAUSTIVE_LIFE_RANGE
1338 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1339 # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS
1340 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001341
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001342 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1343 if (
1344 lr.end_time
1345 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1346 ):
1347 # The current lr's live time stands out, remove it. There is no use adding it to the
1348 # evicted_fms list since the lr should not be included in the fast storage allocation
1349 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1350 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001351
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001352 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1353 # max size MAX_EXHAUSTIVE_ITEMS
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001354 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001355 end_time = competing_lrs[0].end_time
1356 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001357 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001358 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001359 nbr_items = i - start
1360 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 end_time = max(end_time, lr.end_time)
1362 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001363 # The number of items reached the maximum, or the current lr's start time
1364 # does not overlap with the current component's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001365 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001366 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001367 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001368 component_ranges.append((start, len(competing_lrs)))
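        # For example (hypothetical times): lrs with (start, end) times (0, 4), (2, 6) and (10, 12)
        # give component_ranges = [(0, 2), (2, 3)], since the third lr does not overlap the first
        # two; each slice of competing_lrs is then allocated independently below.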
1369
1370 # Allocate each component separately
1371 for start, end in component_ranges:
1372 component_allocator.allocate_component(
1373 competing_lrs[start:end],
1374 max_mem_usage,
1375 base_mem_usage,
1376 self.scratched_fms,
1377 competing_tens_access,
1378 self.evicted_fms,
1379 )
1380 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001381
1382 def move_constant_data(self):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001383 """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
1384 command in the high-level command stream"""
Tim Halld8339a72021-05-27 18:49:40 +01001385 for sched_op in self.sched_ops:
1386 parent_op = sched_op.parent_op
1387 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1388 max_ifm_shram_avail = (
1389 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1390 * self.arch.shram_bank_size
1391 // 2
1392 )
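            # max_ifm_shram_avail: roughly half of the SHRAM bytes left after the reserved output
            # banks (and the LUT, when used); used below as the size threshold above which a
            # broadcast elementwise input is considered for a DMA move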
1393
1394 for idx, tens in enumerate(parent_op.inputs):
1395 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1396 # Tensor is in permanent storage
1397 # Moving the data is only worthwhile when permanent storage differs from feature map storage
1398 if (
1399 tens.mem_area in self.arch.permanent_storage_mem_area
1400 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1401 ) or tens.purpose == TensorPurpose.LUT:
1402 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001403 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001404 tens.purpose == TensorPurpose.FeatureMap
1405 and sched_op.op_type.is_binary_elementwise_op()
1406 and tens.shape != []
1407 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001408 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001409 and tens.storage_size() > max_ifm_shram_avail
1410 ):
1411 only_vector_product_consumers = all(
1412 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1413 for oper in tens.consumers()
1414 )
1415
1416 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1417 new_tens = tens.clone_into_fast_storage(self.arch)
1418 if tens.purpose == TensorPurpose.LUT:
1419 new_tens.mem_area = MemArea.Shram
1420
1421 new_tens.consumer_list.append(parent_op)
1422 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001423 # If the index is out of range, IFM and IFM2 are the same tensor
1424 # and pass inputs don't have duplicates
1425 if idx < len(sched_op.parent_ps.inputs):
1426 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001427
1428 def print_schedule(self, schedule: Schedule):
1429 print(f"Schedule: '{schedule.name}'")
1430 for sched_op in self.sched_ops:
1431 if sched_op not in schedule.cost_map:
1432 # Sub-schedule printing
1433 continue
1434
1435 op_info = schedule.cost_map[sched_op]
1436 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1437 print(f"\t\tType: {sched_op.op_type}")
1438 print(f"\t\tKernel: {sched_op.kernel}")
1439 print(f"{op_info}")
1440 mem_usage = (
1441 schedule.memory_snapshot[op_info.time_index]
1442 if op_info.time_index < len(schedule.memory_snapshot)
1443 else 0
1444 )
1445 print(f"\t\tSRAM Used: {mem_usage} bytes")
1446
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001447 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001448 for i, cascade in enumerate(schedule.cascades.values()):
1449 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001450
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001451
Johan Alfven6e281af2023-02-28 09:03:03 +01001452def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
1453 mem_area = arch.fast_storage_mem_area
1454 mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
1455
1456 # Collect live ranges for the full graph
1457 # extract_live_ranges_from_cascaded_passes will start from the root sg and
1458 # all sub graphs/cascaded passes will be visited and the correct time_index
1459 # will be set for all the tensors.
1460 lr_graph = live_range.LiveRangeGraph()
1461 live_range.extract_live_ranges_from_cascaded_passes(
1462 nng.get_root_subgraph(),
1463 mem_area,
1464 mem_type_set,
1465 lr_graph,
1466 Tensor.AllocationQuantum,
1467 )
1468 # Populate time-array with memory used by live ranges
1469 temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
1470
1471 # Update snapshot for all the npu sub graphs
1472 # No longer needed by the scheduler, but npu_performance
1473 # uses this information, so it must have the correct state
1474 for sg in schedulers:
1475 sg.schedule.memory_snapshot = temporal_usage
1476 sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
1477
1478
Tim Halld8339a72021-05-27 18:49:40 +01001479def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1480 """
1481 Creates live ranges and runs tensor allocator for the current schedule
1482 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1483 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1484 """
1485 root_sg = nng.get_root_subgraph()
1486
1487 alloc_list = []
1488 if arch.is_spilling_enabled():
1489 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1490 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1491 # Order is important
1492 alloc_list.append(mem_alloc_scratch_fast)
1493 alloc_list.append(mem_alloc_scratch)
1494 else:
1495 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1496 alloc_list.append(mem_alloc_scratch)
1497
1498 for mem_area, mem_type_set in alloc_list:
1499 tensor_allocation.allocate_tensors(
1500 nng,
1501 root_sg,
1502 arch,
1503 mem_area,
1504 mem_type_set,
1505 tensor_allocator=options.tensor_allocator,
1506 verbose_allocation=options.verbose_allocation,
1507 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001508 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001509 )
1510
1511
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001512class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001513 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001514 MAX_EXHAUSTIVE_ITEMS = 20
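    # Both limits are heuristics: MAX_EXHAUSTIVE_ITEMS caps how many live ranges are searched
    # exhaustively per component (the search space grows as 2^n), and MAX_EXHAUSTIVE_LIFE_RANGE
    # is the margin, in time steps, used when pruning unusually long-lived ranges beforehand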
Johan Alfvén5c309712022-06-10 15:40:58 +02001515
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1517 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001518 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001519 self.staging_limit = staging_limit
1520 self.lrs = []
1521 self.evicted = []
1522 self.curr_evicted = []
1523 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001524 self.best_score = 0
1525 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001526
Johan Alfvén5c309712022-06-10 15:40:58 +02001527 def allocate_exhaustive(self, ix, score):
1528 # Favour tensors with highest element access (score)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001529 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001530 if score > self.best_score:
1531 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001532 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001533 return
1534
1535 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001537 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1538 # range, it can fit and the case where the tensor is included needs to be checked
1539 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001540 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001541 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001542 self.curr_evicted[ix] = False
1543 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001544 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001545 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1546
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001547 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1548 # case where the tensor is not included can be skipped
1549 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001550 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001551 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001552 self.curr_evicted[ix] = True
1553 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001554 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001555 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1556
1557 @staticmethod
1558 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001559 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001560 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001561 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001562 return mem_usage
1563
1564 @staticmethod
1565 def evict(lr, max_mem_usage, scratched_fms):
1566 for t in range(lr.start_time, lr.end_time + 1):
1567 max_mem_usage[t] -= lr.size
1568 for tens in lr.tensors:
1569 if tens in scratched_fms:
1570 tens.mem_area = scratched_fms[tens][0]
1571 tens.mem_type = scratched_fms[tens][1]
1572
1573 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001574 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001575 for t in range(lr.start_time, lr.end_time + 1):
1576 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001577
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001578 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1579 self.lrs = lrs
1580 self.num_lrs = len(lrs)
1581 self.evicted = [0] * self.num_lrs
1582 self.curr_evicted = [0] * self.num_lrs
1583 self.best_score = -1
1584 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001585 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1586 # For every permutation that fits within the staging_limit, a score is calculated.
1587 # The permutation with the highest score is then chosen. The score is calculated
1588 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1589 # included tensors, so it is not necessarily the tensor with the biggest size that ends up
1590 # being included in the result.
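        # For example (hypothetical sizes and accesses): with staging_limit = 100 and two
        # overlapping lrs of size 60 with element accesses 500 and 800, both cannot be kept,
        # so the permutation keeping only the 800-access lr scores highest and the other is evicted.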
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001591 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001592 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001593 for i, lr in enumerate(self.lrs):
1594 if self.evicted[i]:
1595 self.evict(lr, max_mem, scratched_fms)
1596 if lr not in evicted_fms:
1597 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001598 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001599 self.keep(lr, min_mem)
1600 if lr in evicted_fms:
1601 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001602
1603
Tim Halld8339a72021-05-27 18:49:40 +01001604def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1605 """Entry point for the Scheduler"""
1606 # Initialize CPU subgraphs
1607 schedulers = dict()
1608 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001609 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001610 if sg.placement != PassPlacement.Npu:
1611 # Create cascaded passes for CPU Ops
1612 cascaded_passes = []
1613 for idx, ps in enumerate(sg.passes):
1614 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001615 ps.name,
1616 SchedulingStrategy.WeightStream,
1617 ps.inputs,
1618 [],
1619 ps.outputs,
1620 [ps],
1621 ps.placement,
1622 False,
Tim Halld8339a72021-05-27 18:49:40 +01001623 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001624
Tim Halld8339a72021-05-27 18:49:40 +01001625 cps.time = idx
1626 ps.cascade = cps
1627 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001628
Tim Halld8339a72021-05-27 18:49:40 +01001629 sg.cascaded_passes = cascaded_passes
1630 else:
1631 # Npu subgraph - create schedule
1632 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1633 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001634
Tim Halld8339a72021-05-27 18:49:40 +01001635 scheduler.create_scheduler_representation(arch)
1636 sg.sched_ops = scheduler.sched_ops
1637 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001638
Tim Halld8339a72021-05-27 18:49:40 +01001639 # Create the Max schedule template
1640 max_schedule_template = scheduler.create_initial_schedule()
1641 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001642
Tim Halld8339a72021-05-27 18:49:40 +01001643 # Create the optimized Max schedule
1644 sg.schedule = max_schedule_template
1645 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001646 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001647 sg.schedule = opt_max_schedule
1648 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001649
Tim Halld8339a72021-05-27 18:49:40 +01001650 # Create Min schedule
1651 min_schedule = scheduler.propose_minimal_schedule()
1652 initial_sram_limit = scheduler_options.optimization_sram_limit
1653 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1654 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001655
Johan Alfvén255dad72022-07-16 18:27:05 +02001656 # Build cascades for Min schedule
1657 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001658 sg.schedule = min_schedule
1659 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001660
Tim Halld8339a72021-05-27 18:49:40 +01001661 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1662 # Create an optimized schedule
1663 sg.schedule = scheduler.optimize_schedule(
1664 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1665 )
1666 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001667
Tim Halld8339a72021-05-27 18:49:40 +01001668 scheduler.apply_schedule(sg.schedule)
1669 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001670
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001671 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1672 # It might be possible to gain performance by reducing
1673 # the weight buffer size and instead fitting fms in fast storage
1674 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1675
Tim Halld8339a72021-05-27 18:49:40 +01001676 if scheduler_options.verbose_schedule:
1677 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001678
Johan Alfven6e281af2023-02-28 09:03:03 +01001679 # Make a full live range calculation starting from the root sg
1680 _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
1681
Tim Halld8339a72021-05-27 18:49:40 +01001682 # Evaluate schedule
1683 _update_tensor_allocation(nng, arch, options)