# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Returns the storage shape for the given tensor format (NHCWB16 rounds the depth up to a multiple of 16)"""
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

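    # Illustrative sizing example (not part of the original source): assuming 1-byte elements,
    # an int8 feature map of shape (1, 16, 16, 20) stored as NHCWB16 has its depth rounded up
    # to 32, giving 1 * 16 * 16 * 32 = 8192 bytes, which is then rounded up to the nearest
    # Tensor.AllocationQuantum; stored as NHWC the same tensor needs 1 * 16 * 16 * 20 = 5120
    # bytes before that rounding.
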
292 def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
293 """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
294 ifm_shape = self.ifm.shape
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100295 ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
Tim Halld8339a72021-05-27 18:49:40 +0100296 ofm_shape = stripe
297
298 if ofm_shape != self.ofm.shape:
299 # Striped Op - Need to calculate stripe input volume
300 stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
301 # Ensure stripe input volume is within the full IFM volume
302 stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
303 stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
304 ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)
305
306 if self.ifm2:
307 stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
308 stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
309 ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)
310
311 block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)
312
313 scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
314 if self.parent_op.weights:
315 # Default full-depth weight encoding with no buffering
Tim Halld784af72021-06-08 21:25:57 +0100316 (
317 scheduler_op_info.npu_weights_tensor,
318 scheduler_op_info.npu_scales_tensor,
319 ) = weight_compressor.encode_weight_and_scale_tensor(
Tim Halld8339a72021-05-27 18:49:40 +0100320 self.arch,
321 self.parent_op,
322 self.parent_op.weights,
323 self.parent_op.bias,
324 self.kernel,
325 block_config,
326 [0, self.ofm.shape.depth],
327 )
328
329 self.parent_ps.block_config = block_config.old_style_representation()
330 return scheduler_op_info
331
    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape 'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

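        # Assumed rationale (not stated in the original code): NHCWB16 pads the depth to a
        # multiple of 16, so an OFM whose depth is already 16-aligned loses nothing by using
        # the brick format; only the remaining cases are checked for an overwritable NHWC input.
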
        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

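        # At this point cost.slack_buffering_cycles holds this operator's own cycle count and
        # cost.slack_buffering_memory the SRAM headroom underneath it. propose_weight_buffering
        # later reads the previous operator's slack to decide how much of the current operator's
        # weights can be prefetched while the previous operator is still running.
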
        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable apply size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are feature maps that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Moving the data is only worthwhile when permanent storage differs from feature map storage
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles and how much unused SRAM are available under the previously
        # executing operator for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double-buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

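                # Illustrative numbers (not from the original source): with 100 kB of encoded
                # weights, 8000 slack cycles and 20000 transfer cycles, prebuffer_ratio is 0.4,
                # so roughly 40 kB (capped at half_buffer_limit) is prefetched up front and the
                # remainder is streamed in buffering_depth-sized slices while the op executes.
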
                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)
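                    # For example (illustrative only): prebuffer_depth 32, stripe depth 128 and
                    # buffering_depth 48 give depth_slices == [0, 32, 80, 128], i.e. one
                    # prebuffered slice followed by two streamed slices.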

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)

                # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two
                # slices (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two
                # buffers were used.
                last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
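            # The schedule is traversed output-to-input, so prev_op is the operator that consumes
            # this operator's OFM; its vertical stride dictates the smallest stripe height that is
            # still useful to produce here.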
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)
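            # Illustrative example (not from the original source): with 2x upscaling to_upscale()
            # returns 2, so a proposed stripe height of 5 is padded to 6 above before being scaled
            # by the kernel stride, keeping upscaled stripe heights even.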

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for the Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has the IFM and OFM in SRAM (no buffered weight tensors)
                # Only include IFMs that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0
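            # non_local_mem_usage is the part of the SRAM snapshot at this time index that is not
            # this operator's own IFM/OFM, i.e. memory the cascade builder must treat as already
            # occupied when it sizes the cascades below.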

        # Create cascades for the Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
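        # Illustrative example (not from the original source): an OFM that is 32 rows high whose
        # MIN-schedule stripe was 4 rows is re-evaluated with stripe heights 5..16; stripes taller
        # than half the OFM height are not considered.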
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/OFM DRAM bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if proposed_schedule_mem_usage <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades, optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

Johan Alfvén6f4cb032022-05-05 08:42:46 +02001110 def optimize_weight_buffering_size(
1111 self,
1112 min_schedule: Schedule,
1113 options: SchedulerOptions,
1114 ):
1115 default_schedule = self.sg.schedule
Tim Hallc1be0872022-03-03 17:50:52 +00001116 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001117 default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1118 default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1119
1120 # Restore mem/type for scratched_fms
1121 for tens in self.scratched_fms:
1122 tens.mem_area = self.scratched_fms[tens][0]
1123 tens.mem_type = self.scratched_fms[tens][1]
1124
1125 self.update_op_memory_snapshot(self.sg.schedule)
1126
1127 # Collect live ranges from tensors
1128 memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
1129 lr_graph = live_range.LiveRangeGraph()
1130 for mem_area, mem_type_set in memories_list:
1131 live_range.extract_live_ranges_from_cascaded_passes(
1132 self.nng.get_root_subgraph(),
1133 mem_area,
1134 mem_type_set,
1135 lr_graph,
1136 Tensor.AllocationQuantum,
1137 )
1138
1139 # Find the relation between the sched_op and the buffering tensor
1140 weight_ops = {}
1141 for sched_op in self.sched_ops:
1142 cost = self.sg.schedule.cost_map[sched_op]
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001143 for tens in cost.buffered_weight_tensors:
1144 weight_ops[tens] = sched_op
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001145
1146 # Filter out weight buffer live ranges
1147 weight_lrs = []
1148 for lr in lr_graph.lrs:
1149 for tens in lr.tensors:
1150 if weight_ops.get(tens):
1151 weight_lrs.append(lr)
1152 break
1153
1154 # See if any evicted fm overlaps with a weight buffering op.
1155 # If this is the case add a size limitation to the buffering op
1156 for lr in self.evicted_fms:
1157 for weight_lr in weight_lrs:
1158 if lr.overlaps_ranges(weight_lr):
1159 for tens in weight_lr.tensors:
1160 sched_op = weight_ops.get(tens)
1161 if sched_op:
1162 # Add size reduction to the op
1163 sched_op.evicted_fms_size += lr.size
1164 break
1165
1166 self.sg.schedule = min_schedule
1167 self.update_op_memory_snapshot(self.sg.schedule)
1168
1169 # Run schedule buffering - with weight buffer size reduction
1170 schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
1171 schedule.cascades = self.sg.schedule.cascades
1172 self.sg.schedule = schedule
1173
1174 # Apply new buffer schdule and calc new performance
1175 self.update_op_memory_snapshot(self.sg.schedule)
1176 self.apply_schedule(self.sg.schedule)
1177 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1178
Tim Hallc1be0872022-03-03 17:50:52 +00001179 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001180 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1181 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1182
Tim Hall8bc7a652022-05-19 15:29:23 +01001183 improvement_tot = (
1184 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1185 )
1186 improvement_dram = (
1187 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1188 )
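# Hypothetical example: default_tot_cycles = 100000 and new_tot_cycles = 95000 gives
# improvement_tot = round((100000 - 95000) / 100000, 2) = 0.05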
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001189
1190 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001191 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001192 # No improvement, restore the default schedule
1193 for sched_op in self.sched_ops:
1194 sched_op.evicted_fms_size = 0
1195
1196 for tens in self.scratched_fms:
1197 tens.mem_area = self.scratched_fms[tens][0]
1198 tens.mem_type = self.scratched_fms[tens][1]
1199
1200 self.sg.schedule = default_schedule
1201 self.update_op_memory_snapshot(self.sg.schedule)
1202 self.apply_schedule(self.sg.schedule)
1203 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1204
Tim Halld8339a72021-05-27 18:49:40 +01001205 def apply_schedule(self, sched: Schedule):
1206 """Applies the given schedule as a final solution"""
1207 for sched_op in self.sched_ops:
1208 op_info = sched.cost_map[sched_op]
1209 cascade_info = sched.cascades.get(op_info.cascade, None)
1210 if cascade_info and sched_op in cascade_info.buffers:
1211 buffer_tens = sched_op.ifm.connection.parent_tens
1212 # Apply memory area and type
1213 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1214 buffer_tens.mem_type = MemType.Scratch_fast
1215 # Apply Rolling buffer
1216 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1217 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1218
1219 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1220
1221 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001222 for tens in op_info.buffered_weight_tensors:
1223 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001224
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001225 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001226 """Finds the set of feature maps that fits within the staging limit and, combined, has the largest number of
1227 access cycles, and moves those feature maps into fast storage"""
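# Outline of the steps below: force eligible OFMs to fast storage, extract live ranges, evict feature
# maps that can never fit, keep those that always fit, score the remaining (competing) tensors by their
# estimated element access, split them into components of overlapping live ranges and let
# FastStorageComponentAllocator pick the best-scoring subset of each component.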
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001228 max_mem_usage = []
1229 base_mem_usage = []
1230 fast_storage_type = MemType.Scratch_fast
1231 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001232 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001233
1234 # Force all OFMs to fast-storage
1235 for sched_op in self.sched_ops:
1236 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001237 if cost.cascade == 0 and sched_op.get_dependants():
1238 ofm_tens = sched_op.ofm.connection.parent_tens
1239 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001240 if ofm_tens not in self.scratched_fms:
1241 # Remember default mem area and mem type, only done once
1242 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1243
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001244 ofm_tens.mem_area = fast_storage_mem_area
1245 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001246
1247 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001248 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001249 lr_graph = live_range.LiveRangeGraph()
1250 for mem_area, mem_type_set in memories_list:
1251 live_range.extract_live_ranges_from_cascaded_passes(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001252 self.nng.get_root_subgraph(),
1253 mem_area,
1254 mem_type_set,
1255 lr_graph,
1256 Tensor.AllocationQuantum,
Tim Halld8339a72021-05-27 18:49:40 +01001257 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001258 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001259
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001260 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001261 if max(max_mem_usage) <= staging_limit:
1262 return
1263
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001264 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 base_mem_usage = np.array(max_mem_usage)
1266 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001267 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001268 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001269 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 curr_lrs.append(lr)
1271 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1272 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001273 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001274 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001275
1276 # Evict live ranges that will never fit
1277 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001278 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001279 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001280 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001281 self.evicted_fms.append(lr)
1282 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001283 curr_lrs.remove(lr)
1284
1285 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1286 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001287 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1288 # the memory limit cannot be exceeded if max_mem_usage does not.
1289 # Thus, the affected lrs can remain in fast-storage if the following is true
1290 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001291 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001292 else:
1293 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001294 for tens in lr.tensors:
1295 competing_tens_access[tens] = 0
1296
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001297 # All lrs and their tensors have been handled if competing_lrs is empty, we may thus return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001298 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001299 return
1300
Johan Alfvén5c309712022-06-10 15:40:58 +02001301 # Estimate element access for all tensors that are competing for a place in fast-storage.
1302 # This number is used when deciding which tensors stay in fast-storage
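# A tensor that is read or written by several ops accumulates the element access of each of them below,
# so a small but heavily accessed feature map can outscore a larger, rarely accessed one.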
1303 for sched_op in self.sched_ops:
1304 cost = schedule.cost_map[sched_op]
1305
1306 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1307 tens = sched_op.ifm.connection.parent_tens
1308 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1309 competing_tens_access[tens] += access.ifm_read[0]
1310
1311 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1312 tens = sched_op.ifm2.connection.parent_tens
1313 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1314 competing_tens_access[tens] += access.ifm_read[1]
1315
1316 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1317 tens = sched_op.ofm.connection.parent_tens
1318 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1319 competing_tens_access[tens] += access.ofm_write
1320
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001321 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001322 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001323
1324 # Remove lrs that have a live range that is too long compared to others.
1325 # They are causing problems for the HillClimb Allocator when it has to
1326 # change the allocation indices, in order to fit all the allocations into SRAM.
1327 # This problem only occurs in larger networks with complex graphs.
1328 #
1329 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1330 # at a time. Too many would give too long a compilation time.
1331 #
1332 # Too long is currently decided to be (based on experience, analyzing many networks):
1333 # Compare lr at position i with lr at position i + MAX_EXHAUSTIVE_ITEMS.
1334 # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
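# With the current constants (MAX_EXHAUSTIVE_ITEMS = 20, MAX_EXHAUSTIVE_LIFE_RANGE = 20) this means:
# a live range spanning at least 20 time steps is dropped from the competition if it ends more than
# 20 time steps after the live range 20 positions further ahead in the sorted list.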
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001335 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1336 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001337 competing_lrs_copy = competing_lrs.copy()
1338 for i, lr in enumerate(competing_lrs_copy):
1339 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001340 # Only check live ranges that are at least MAX_EXHAUSTIVE_LIFE_RANGE long
1341 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1342 # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS
1343 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001344
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001345 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1346 if (
1347 lr.end_time
1348 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1349 ):
1350 # Current lr live time stands out, remove it. No use adding it to the
1351 # evicted_fms list since the lr should not be included in the fast storage allocation
1352 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1353 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001354
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001355 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1356 # max size MAX_EXHAUSTIVE_ITEMS
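# Hypothetical example: live ranges covering times [0, 3], [2, 5] and [8, 9] form two components,
# ([0, 3], [2, 5]) and ([8, 9],), since the third range does not overlap the first two
# (assuming fewer than MAX_EXHAUSTIVE_ITEMS live ranges per component).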
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001357 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001358 end_time = competing_lrs[0].end_time
1359 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001360 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001362 nbr_items = i - start
1363 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001364 end_time = max(end_time, lr.end_time)
1365 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001366 # Number of items reached the maximum, or the current lr's start time
1367 # does not overlap with the previous lr's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001368 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001369 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001370 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001371 component_ranges.append((start, len(competing_lrs)))
1372
1373 # Allocate each component separately
1374 for start, end in component_ranges:
1375 component_allocator.allocate_component(
1376 competing_lrs[start:end],
1377 max_mem_usage,
1378 base_mem_usage,
1379 self.scratched_fms,
1380 competing_tens_access,
1381 self.evicted_fms,
1382 )
1383 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001384
1385 def move_constant_data(self):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001386 """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
1387 command in the high-level command stream"""
Tim Halld8339a72021-05-27 18:49:40 +01001388 for sched_op in self.sched_ops:
1389 parent_op = sched_op.parent_op
1390 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1391 max_ifm_shram_avail = (
1392 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1393 * self.arch.shram_bank_size
1394 // 2
1395 )
1396
1397 for idx, tens in enumerate(parent_op.inputs):
1398 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1399 # Tensor is in permanent storage
1400 # Only when permanent storage differs from feature map storage is there a point in moving the data
1401 if (
1402 tens.mem_area in self.arch.permanent_storage_mem_area
1403 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1404 ) or tens.purpose == TensorPurpose.LUT:
1405 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001406 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001407 tens.purpose == TensorPurpose.FeatureMap
1408 and sched_op.op_type.is_binary_elementwise_op()
1409 and tens.shape != []
1410 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001411 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001412 and tens.storage_size() > max_ifm_shram_avail
1413 ):
1414 only_vector_product_consumers = all(
1415 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1416 for oper in tens.consumers()
1417 )
1418
1419 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1420 new_tens = tens.clone_into_fast_storage(self.arch)
1421 if tens.purpose == TensorPurpose.LUT:
1422 new_tens.mem_area = MemArea.Shram
1423
1424 new_tens.consumer_list.append(parent_op)
1425 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001426 # If the index is out of range, IFM and IFM2 are the same tensor
1427 # and pass inputs don't have duplicates
1428 if idx < len(sched_op.parent_ps.inputs):
1429 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001430
1431 def print_schedule(self, schedule: Schedule):
1432 print(f"Schedule: '{schedule.name}'")
1433 for sched_op in self.sched_ops:
1434 if sched_op not in schedule.cost_map:
1435 # Sub-schedule printing
1436 continue
1437
1438 op_info = schedule.cost_map[sched_op]
1439 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1440 print(f"\t\tType: {sched_op.op_type}")
1441 print(f"\t\tKernel: {sched_op.kernel}")
1442 print(f"{op_info}")
1443 mem_usage = (
1444 schedule.memory_snapshot[op_info.time_index]
1445 if op_info.time_index < len(schedule.memory_snapshot)
1446 else 0
1447 )
1448 print(f"\t\tSRAM Used: {mem_usage} bytes")
1449
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001450 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001451 for i, cascade in enumerate(schedule.cascades.values()):
1452 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001453
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001454
Tim Halld8339a72021-05-27 18:49:40 +01001455def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1456 """
1457 Creates live ranges and runs tensor allocator for the current schedule
1458 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1459 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1460 """
1461 root_sg = nng.get_root_subgraph()
1462
1463 alloc_list = []
1464 if arch.is_spilling_enabled():
1465 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1466 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1467 # Order is important
1468 alloc_list.append(mem_alloc_scratch_fast)
1469 alloc_list.append(mem_alloc_scratch)
1470 else:
1471 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1472 alloc_list.append(mem_alloc_scratch)
1473
1474 for mem_area, mem_type_set in alloc_list:
1475 tensor_allocation.allocate_tensors(
1476 nng,
1477 root_sg,
1478 arch,
1479 mem_area,
1480 mem_type_set,
1481 tensor_allocator=options.tensor_allocator,
1482 verbose_allocation=options.verbose_allocation,
1483 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001484 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001485 )
1486
1487
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001488class FastStorageComponentAllocator:
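"""Exhaustively searches for the subset of competing feature map live ranges that fits within the
staging limit and has the highest combined element access, keeping that subset in fast storage and
evicting the rest. Live ranges are handled one component (group of overlapping live ranges) at a time."""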
Johan Alfvén5c309712022-06-10 15:40:58 +02001489 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001490 MAX_EXHAUSTIVE_ITEMS = 20
Johan Alfvén5c309712022-06-10 15:40:58 +02001491
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001492 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1493 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001494 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001495 self.staging_limit = staging_limit
1496 self.lrs = []
1497 self.evicted = []
1498 self.curr_evicted = []
1499 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001500 self.best_score = 0
1501 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001502
Johan Alfvén5c309712022-06-10 15:40:58 +02001503 def allocate_exhaustive(self, ix, score):
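# ix is the index of the live range currently being decided on; score is the accumulated element
# access of the live ranges kept in fast storage so far along this branch of the search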
1504 # Favour tensors with highest element access (score)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001505 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001506 if score > self.best_score:
1507 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001508 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001509 return
1510
1511 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001512
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001513 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1514 # range, it can fit and the case where the tensor is included needs to be checked
1515 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001517 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001518 self.curr_evicted[ix] = False
1519 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001520 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001521 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1522
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001523 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1524 # case where the tensor is not included can be skipped
1525 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001526 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001527 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001528 self.curr_evicted[ix] = True
1529 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001530 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001531 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1532
1533 @staticmethod
1534 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001535 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001537 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001538 return mem_usage
1539
1540 @staticmethod
1541 def evict(lr, max_mem_usage, scratched_fms):
1542 for t in range(lr.start_time, lr.end_time + 1):
1543 max_mem_usage[t] -= lr.size
1544 for tens in lr.tensors:
1545 if tens in scratched_fms:
1546 tens.mem_area = scratched_fms[tens][0]
1547 tens.mem_type = scratched_fms[tens][1]
1548
1549 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001550 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001551 for t in range(lr.start_time, lr.end_time + 1):
1552 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001553
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001554 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1555 self.lrs = lrs
1556 self.num_lrs = len(lrs)
1557 self.evicted = [0] * self.num_lrs
1558 self.curr_evicted = [0] * self.num_lrs
1559 self.best_score = -1
1560 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001561 # Recursively evaluate all combinations of keeping or evicting the lrs found in the component.
1562 # For every combination that fits within the staging_limit a score is calculated.
1563 # The combination with the highest score is then chosen. The score is calculated
1564 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1565 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1566 # being included in the result.
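# Hypothetical example: if two overlapping live ranges of 60 kB and 40 kB cannot both fit, and their
# tensors have element accesses of 1000 and 4000 respectively, the 40 kB live range is the one kept
# in fast storage since its access count gives the higher score.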
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001567 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001568 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001569 for i, lr in enumerate(self.lrs):
1570 if self.evicted[i]:
1571 self.evict(lr, max_mem, scratched_fms)
1572 if lr not in evicted_fms:
1573 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001574 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001575 self.keep(lr, min_mem)
1576 if lr in evicted_fms:
1577 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001578
1579
Tim Halld8339a72021-05-27 18:49:40 +01001580def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1581 """Entry point for the Scheduler"""
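# For each NPU subgraph the flow below is: build the scheduler representation, move constant data,
# create and buffer the Max schedule, create the Min schedule and its cascades, optionally optimize
# for performance, apply the chosen schedule, move feature maps to fast storage and, if feature maps
# were evicted, retry with reduced weight buffering. Finally the tensor allocation is updated.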
1582 # Initialize CPU subgraphs
1583 schedulers = dict()
1584 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001585 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001586 if sg.placement != PassPlacement.Npu:
1587 # Create cascaded passes for CPU Ops
1588 cascaded_passes = []
1589 for idx, ps in enumerate(sg.passes):
1590 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001591 ps.name,
1592 SchedulingStrategy.WeightStream,
1593 ps.inputs,
1594 [],
1595 ps.outputs,
1596 [ps],
1597 ps.placement,
1598 False,
Tim Halld8339a72021-05-27 18:49:40 +01001599 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001600
Tim Halld8339a72021-05-27 18:49:40 +01001601 cps.time = idx
1602 ps.cascade = cps
1603 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001604
Tim Halld8339a72021-05-27 18:49:40 +01001605 sg.cascaded_passes = cascaded_passes
1606 else:
1607 # Npu subgraph - create schedule
1608 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1609 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001610
Tim Halld8339a72021-05-27 18:49:40 +01001611 scheduler.create_scheduler_representation(arch)
1612 sg.sched_ops = scheduler.sched_ops
1613 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001614
Tim Halld8339a72021-05-27 18:49:40 +01001615 # Create the Max schedule template
1616 max_schedule_template = scheduler.create_initial_schedule()
1617 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001618
Tim Halld8339a72021-05-27 18:49:40 +01001619 # Create the optimised Max schedule
1620 sg.schedule = max_schedule_template
1621 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001622 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001623 sg.schedule = opt_max_schedule
1624 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001625
Tim Halld8339a72021-05-27 18:49:40 +01001626 # Create Min schedule
1627 min_schedule = scheduler.propose_minimal_schedule()
1628 initial_sram_limit = scheduler_options.optimization_sram_limit
1629 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1630 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001631
Johan Alfvén255dad72022-07-16 18:27:05 +02001632 # Build cascades for Min schedule
1633 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001634 sg.schedule = min_schedule
1635 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001636
Tim Halld8339a72021-05-27 18:49:40 +01001637 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1638 # Create an optimized schedule
1639 sg.schedule = scheduler.optimize_schedule(
1640 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1641 )
1642 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001643
Tim Halld8339a72021-05-27 18:49:40 +01001644 scheduler.apply_schedule(sg.schedule)
1645 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001646
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001647 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1648 # It might be possible to gain performance by reducing
1649 # weight buffer size and instead fit fms in fast storage
1650 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1651
Tim Halld8339a72021-05-27 18:49:40 +01001652 if scheduler_options.verbose_schedule:
1653 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001654
Tim Halld8339a72021-05-27 18:49:40 +01001655 # Evaluate schedule
1656 _update_tensor_allocation(nng, arch, options)