# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Enable class name forward references in the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
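    """Returns the storage shape for the given tensor format; NHCWB16 rounds the depth up to a multiple of 16"""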
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
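    """Light-weight representation of a Tensor, holding only the attributes that the Scheduler needs"""
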
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # LUT must be placed in shram area. The copy is done by DMA
        # generated by the high level command stream generator.
        for idx, tens in enumerate(self.parent_op.inputs):
            if tens.purpose == TensorPurpose.LUT:
                new_tens = tens.clone_into_shram(self.arch)
                new_tens.consumer_list.append(self.parent_op)
                self.parent_op.inputs[idx] = new_tens

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        """For elementwise ops, when the IFM is in NHWC format and its depth is not brick-format
        aligned (a multiple of 16), it is better to force the OFM format to NHWC whenever the OFM
        can overwrite the IFM, in order to reduce memory transactions"""

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
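        """Updates the schedule's memory snapshot and fast storage peak usage based on the tensor live ranges"""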
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
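        """Builds a performance query for the op at the given OFM depth and returns the estimated cycle cost"""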
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
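        """Builds a performance query for the op at the given OFM depth and returns the estimated element accesses"""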
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are feature maps that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
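        """Returns True if the weights are in permanent storage and need to be DMA'd into fast storage"""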
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two slices
            # (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two buffers were used.
            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
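        """Creates a tensor of the given size in fast storage (Scratch_fast) to be used as a weight buffer"""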
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
870 """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
871 next operators stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
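        """Builds cascades for the Min schedule based on the residual (non-local) memory at each Op's time index"""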
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFMs that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
1004 """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
1005 proposing weight buffering and then continously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/OFM DRAM bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

    def optimize_weight_buffering_size(
        self,
        min_schedule: Schedule,
        options: SchedulerOptions,
    ):
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        # Restore mem/type for scratched_fms
        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.update_op_memory_snapshot(self.sg.schedule)

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Find the relation between the sched_op and the buffering tensor
        weight_ops = {}
        for sched_op in self.sched_ops:
            cost = self.sg.schedule.cost_map[sched_op]
            for tens in cost.buffered_weight_tensors:
                weight_ops[tens] = sched_op

        # Filter out weight buffer live ranges
        weight_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if weight_ops.get(tens):
                    weight_lrs.append(lr)
                    break

        # See if any evicted fm overlaps with a weight buffering op.
        # If this is the case add a size limitation to the buffering op
        for lr in self.evicted_fms:
            for weight_lr in weight_lrs:
                if lr.overlaps_ranges(weight_lr):
                    for tens in weight_lr.tensors:
                        sched_op = weight_ops.get(tens)
                        if sched_op:
                            # Add size reduction to the op
                            sched_op.evicted_fms_size += lr.size
                            break

        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(self.sg.schedule)

        # Run schedule buffering - with weight buffer size reduction
        schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
        schedule.cascades = self.sg.schedule.cascades
        self.sg.schedule = schedule

        # Apply the new buffer schedule and calculate new performance
1180 self.update_op_memory_snapshot(self.sg.schedule)
1181 self.apply_schedule(self.sg.schedule)
1182 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1183
Tim Hallc1be0872022-03-03 17:50:52 +00001184 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001185 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1186 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1187
Tim Hall8bc7a652022-05-19 15:29:23 +01001188 improvement_tot = (
1189 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1190 )
1191 improvement_dram = (
1192 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1193 )
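        # Worked example with hypothetical cycle counts: default_tot_cycles = 1000 and new_tot_cycles = 950
        # give improvement_tot = round((1000 - 950) / 1000, 2) = 0.05, i.e. a 5% reduction; a negative value
        # means the new schedule is slower than the default one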
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001194
1195 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001196 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001197 # No improvement, restore the default schedule
1198 for sched_op in self.sched_ops:
1199 sched_op.evicted_fms_size = 0
1200
1201 for tens in self.scratched_fms:
1202 tens.mem_area = self.scratched_fms[tens][0]
1203 tens.mem_type = self.scratched_fms[tens][1]
1204
1205 self.sg.schedule = default_schedule
1206 self.update_op_memory_snapshot(self.sg.schedule)
1207 self.apply_schedule(self.sg.schedule)
1208 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1209
Tim Halld8339a72021-05-27 18:49:40 +01001210 def apply_schedule(self, sched: Schedule):
1211 """Applies the given schedule as a final solution"""
1212 for sched_op in self.sched_ops:
1213 op_info = sched.cost_map[sched_op]
1214 cascade_info = sched.cascades.get(op_info.cascade, None)
1215 if cascade_info and sched_op in cascade_info.buffers:
1216 buffer_tens = sched_op.ifm.connection.parent_tens
1217 # Apply memory area and type
1218 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1219 buffer_tens.mem_type = MemType.Scratch_fast
1220 # Apply Rolling buffer
1221 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1222 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1223
1224 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1225
1226 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001227 for tens in op_info.buffered_weight_tensors:
1228 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001229
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001230 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001231 """Finds the set of feature maps that fits within the staging limit and has the largest combined number of
1232 access cycles, then moves those feature maps into fast storage"""
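        # The approach below:
        # 1. Tentatively force all eligible OFMs into fast storage and build their live ranges
        # 2. Evict live ranges that can never fit within the staging limit
        # 3. Keep live ranges that always fit and let the remaining ones compete
        # 4. Resolve the competing live ranges per component with an exhaustive search that favours the
        #    tensors with the highest estimated element access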
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001233 max_mem_usage = []
1234 base_mem_usage = []
1235 fast_storage_type = MemType.Scratch_fast
1236 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001237 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001238
1239 # Force all OFMs to fast-storage
1240 for sched_op in self.sched_ops:
1241 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001242 if cost.cascade == 0 and sched_op.get_dependants():
1243 ofm_tens = sched_op.ofm.connection.parent_tens
1244 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001245 if ofm_tens not in self.scratched_fms:
1246 # Remember default mem area and mem type, only done once
1247 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1248
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001249 ofm_tens.mem_area = fast_storage_mem_area
1250 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001251
1252 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001253 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001254 lr_graph = live_range.LiveRangeGraph()
1255 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001256 live_range.extract_live_ranges_from_schedule(
1257 self.sg,
Jonas Ohlssond8575072022-03-30 10:30:25 +02001258 mem_area,
1259 mem_type_set,
1260 lr_graph,
Tim Halld8339a72021-05-27 18:49:40 +01001261 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001262 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001263
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001264 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 if max(max_mem_usage) <= staging_limit:
1266 return
1267
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001268 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001269 base_mem_usage = np.array(max_mem_usage)
1270 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001271 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001272 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001273 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001274 curr_lrs.append(lr)
1275 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1276 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001277 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001278 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001279
1280 # Evict live ranges that will never fit
1281 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001282 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001283 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001284 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001285 self.evicted_fms.append(lr)
1286 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001287 curr_lrs.remove(lr)
1288
1289 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1290 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001291 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1292 # the memory limit cannot be exceeded if max_mem_usage does not.
1293 # Thus, the affected lrs can remain in fast-storage if the following is true
1294 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001295 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001296 else:
1297 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001298 for tens in lr.tensors:
1299 competing_tens_access[tens] = 0
1300
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001301 # All lrs and their tensors have been handled if competing_lrs is empty, we may thus return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001302 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001303 return
1304
Johan Alfvén5c309712022-06-10 15:40:58 +02001305 # Estimate element access for all tensors that are competing for a place in fast-storage.
1306 # This number is used when deciding which tensors stay in fast-storage
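        # Each competing tensor accumulates the ifm (and ifm2) reads of the ops consuming it and the ofm
        # writes of the op producing it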
1307 for sched_op in self.sched_ops:
1308 cost = schedule.cost_map[sched_op]
1309
1310 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1311 tens = sched_op.ifm.connection.parent_tens
1312 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1313 competing_tens_access[tens] += access.ifm_read[0]
1314
1315 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1316 tens = sched_op.ifm2.connection.parent_tens
1317 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1318 competing_tens_access[tens] += access.ifm_read[1]
1319
1320 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1321 tens = sched_op.ofm.connection.parent_tens
1322 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1323 competing_tens_access[tens] += access.ofm_write
1324
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001325 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001326 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001327
1328 # Remove lrs that have a live range that is too long compared to others.
1329 # They cause problems for the HillClimb Allocator when it has to
1330 # change the allocation indices, in order to fit all the allocations into SRAM.
1331 # This problem only occurs in larger networks with complex graphs.
1332 #
1333 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1334 # at a time. Too many items lead to excessively long compilation times.
1335 #
1336 # Too long is currently decided to be (based on experience, analyzing many networks):
1337 # Compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1338 # If the end time differs by more than MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
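        # For example, with MAX_EXHAUSTIVE_ITEMS = 20 and MAX_EXHAUSTIVE_LIFE_RANGE = 20: an lr spanning at
        # least 20 time steps whose end time is more than 20 time steps beyond the end time of the lr 20
        # positions later in the sorted list (or the last lr) is evicted here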
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001339 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1340 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001341 competing_lrs_copy = competing_lrs.copy()
1342 for i, lr in enumerate(competing_lrs_copy):
1343 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001344 # Only check live ranges longer than MAX_EXHAUSTIVE_LIFE_RANGE
1345 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1346 # Compare the current lr with the lr at position i + MAX_EXHAUSTIVE_ITEMS
1347 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001348
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001349 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1350 if (
1351 lr.end_time
1352 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1353 ):
1354 # Current lr live time stands out, remove it. No use adding it to the
1355 # evicted_fms list since the lr should not be included in the fast storage allocation
1356 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1357 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001358
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001359 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1360 # max size MAX_EXHAUSTIVE_ITEMS
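        # For example, lrs with time ranges [0, 3], [2, 5] and [8, 9] form two components: the first two
        # overlap each other, while the third starts after the first component has ended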
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001362 end_time = competing_lrs[0].end_time
1363 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001364 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001365 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001366 nbr_items = i - start
1367 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001368 end_time = max(end_time, lr.end_time)
1369 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001370 # The number of items reached the maximum, or the current lr's start time
1371 # does not overlap with the previous lr's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001372 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001373 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001374 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001375 component_ranges.append((start, len(competing_lrs)))
1376
1377 # Allocate each component separately
1378 for start, end in component_ranges:
1379 component_allocator.allocate_component(
1380 competing_lrs[start:end],
1381 max_mem_usage,
1382 base_mem_usage,
1383 self.scratched_fms,
1384 competing_tens_access,
1385 self.evicted_fms,
1386 )
1387 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001388
Tim Halld8339a72021-05-27 18:49:40 +01001389 def print_schedule(self, schedule: Schedule):
1390 print(f"Schedule: '{schedule.name}'")
1391 for sched_op in self.sched_ops:
1392 if sched_op not in schedule.cost_map:
1393 # Sub-schedule printing
1394 continue
1395
1396 op_info = schedule.cost_map[sched_op]
1397 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1398 print(f"\t\tType: {sched_op.op_type}")
1399 print(f"\t\tKernel: {sched_op.kernel}")
1400 print(f"{op_info}")
1401 mem_usage = (
1402 schedule.memory_snapshot[op_info.time_index]
1403 if op_info.time_index < len(schedule.memory_snapshot)
1404 else 0
1405 )
1406 print(f"\t\tSRAM Used: {mem_usage} bytes")
1407
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001408 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001409 for i, cascade in enumerate(schedule.cascades.values()):
1410 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001411
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001412
Johan Alfven6e281af2023-02-28 09:03:03 +01001413def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
1414 mem_area = arch.fast_storage_mem_area
1415 mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
1416
1417 # Collect live ranges for the full graph
1418 # extract_live_ranges_from_cascaded_passes will start from the root sg and
1419 # all sub graphs/cascaded passes will be visited and the correct time_index
1420 # will be set for all the tensors.
1421 lr_graph = live_range.LiveRangeGraph()
1422 live_range.extract_live_ranges_from_cascaded_passes(
1423 nng.get_root_subgraph(),
1424 mem_area,
1425 mem_type_set,
1426 lr_graph,
1427 Tensor.AllocationQuantum,
1428 )
1429 # Populate time-array with memory used by live ranges
1430 temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
1431
1432 # Update snapshot for all the npu sub graphs
1433 # Not needed for the scheduler any longer but npu_performance
1434 # is using this information so it must have the correct state
1435 for sg in schedulers:
1436 sg.schedule.memory_snapshot = temporal_usage
1437 sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
1438
1439
Tim Halld8339a72021-05-27 18:49:40 +01001440def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1441 """
1442 Creates live ranges and runs tensor allocator for the current schedule
1443 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1444 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1445 """
1446 root_sg = nng.get_root_subgraph()
1447
1448 alloc_list = []
1449 if arch.is_spilling_enabled():
1450 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1451 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1452 # Order is important
1453 alloc_list.append(mem_alloc_scratch_fast)
1454 alloc_list.append(mem_alloc_scratch)
1455 else:
1456 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1457 alloc_list.append(mem_alloc_scratch)
1458
1459 for mem_area, mem_type_set in alloc_list:
1460 tensor_allocation.allocate_tensors(
1461 nng,
1462 root_sg,
1463 arch,
1464 mem_area,
1465 mem_type_set,
1466 tensor_allocator=options.tensor_allocator,
1467 verbose_allocation=options.verbose_allocation,
1468 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001469 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001470 )
1471
1472
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001473class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001474 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001475 MAX_EXHAUSTIVE_ITEMS = 20
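    # With at most MAX_EXHAUSTIVE_ITEMS live ranges per component, allocate_exhaustive evaluates at most
    # 2**20 (roughly one million) include/evict combinations per component, fewer in practice due to pruning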
Johan Alfvén5c309712022-06-10 15:40:58 +02001476
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001477 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1478 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001479 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001480 self.staging_limit = staging_limit
1481 self.lrs = []
1482 self.evicted = []
1483 self.curr_evicted = []
1484 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001485 self.best_score = 0
1486 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001487
Johan Alfvén5c309712022-06-10 15:40:58 +02001488 def allocate_exhaustive(self, ix, score):
1489 # Favour tensors with highest element access (score)
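        # Depth-first search over include/evict decisions for self.lrs[ix:]. base_mem_usage reflects only the
        # lrs kept so far, max_mem_usage reflects all remaining lrs still being kept; the include branch is
        # skipped when the lr cannot fit on top of base_mem_usage, and the evict branch is skipped when
        # max_mem_usage already fits within the staging limit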
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001490 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001491 if score > self.best_score:
1492 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001493 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001494 return
1495
1496 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001497
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001498 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1499 # range, it can fit and the case where the tensor is included needs to be checked
1500 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001501 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001502 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001503 self.curr_evicted[ix] = False
1504 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001505 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001506 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1507
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001508 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1509 # case where the tensor is not included can be skipped
1510 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001511 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001512 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001513 self.curr_evicted[ix] = True
1514 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001515 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1517
1518 @staticmethod
1519 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001520 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001521 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001522 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001523 return mem_usage
1524
1525 @staticmethod
1526 def evict(lr, max_mem_usage, scratched_fms):
1527 for t in range(lr.start_time, lr.end_time + 1):
1528 max_mem_usage[t] -= lr.size
1529 for tens in lr.tensors:
1530 if tens in scratched_fms:
1531 tens.mem_area = scratched_fms[tens][0]
1532 tens.mem_type = scratched_fms[tens][1]
1533
1534 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001535 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 for t in range(lr.start_time, lr.end_time + 1):
1537 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001538
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001539 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1540 self.lrs = lrs
1541 self.num_lrs = len(lrs)
1542 self.evicted = [0] * self.num_lrs
1543 self.curr_evicted = [0] * self.num_lrs
1544 self.best_score = -1
1545 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001546 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1547 # For every permutation that fits within the staging_limit, a score is calculated.
1548 # The permutation with the highest score is then chosen. The score is calculated
1549 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1550 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1551 # being included in the result.
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001552 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001553 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001554 for i, lr in enumerate(self.lrs):
1555 if self.evicted[i]:
1556 self.evict(lr, max_mem, scratched_fms)
1557 if lr not in evicted_fms:
1558 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001559 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001560 self.keep(lr, min_mem)
1561 if lr in evicted_fms:
1562 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001563
1564
Tim Halld8339a72021-05-27 18:49:40 +01001565def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1566 """Entry point for the Scheduler"""
1567 # Initialize CPU subgraphs
1568 schedulers = dict()
1569 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001570 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001571 if sg.placement != PassPlacement.Npu:
1572 # Create cascaded passes for CPU Ops
1573 cascaded_passes = []
1574 for idx, ps in enumerate(sg.passes):
1575 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001576 ps.name,
1577 SchedulingStrategy.WeightStream,
1578 ps.inputs,
1579 [],
1580 ps.outputs,
1581 [ps],
1582 ps.placement,
1583 False,
Tim Halld8339a72021-05-27 18:49:40 +01001584 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001585
Tim Halld8339a72021-05-27 18:49:40 +01001586 cps.time = idx
1587 ps.cascade = cps
1588 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001589
Tim Halld8339a72021-05-27 18:49:40 +01001590 sg.cascaded_passes = cascaded_passes
1591 else:
1592 # Npu subgraph - create schedule
1593 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1594 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001595
Tim Halld8339a72021-05-27 18:49:40 +01001596 scheduler.create_scheduler_representation(arch)
1597 sg.sched_ops = scheduler.sched_ops
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001598
Tim Halld8339a72021-05-27 18:49:40 +01001599 # Create the Max schedule template
1600 max_schedule_template = scheduler.create_initial_schedule()
1601 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001602
Tim Halld8339a72021-05-27 18:49:40 +01001603 # Create the optimised Max schedule
1604 sg.schedule = max_schedule_template
1605 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001606 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001607 sg.schedule = opt_max_schedule
1608 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001609
Tim Halld8339a72021-05-27 18:49:40 +01001610 # Create Min schedule
1611 min_schedule = scheduler.propose_minimal_schedule()
1612 initial_sram_limit = scheduler_options.optimization_sram_limit
1613 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1614 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001615
Johan Alfvén255dad72022-07-16 18:27:05 +02001616 # Build cascades for Min schedule
1617 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001618 sg.schedule = min_schedule
1619 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001620
Tim Halld8339a72021-05-27 18:49:40 +01001621 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1622 # Create an optimized schedule
1623 sg.schedule = scheduler.optimize_schedule(
1624 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1625 )
1626 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001627
Tim Halld8339a72021-05-27 18:49:40 +01001628 scheduler.apply_schedule(sg.schedule)
1629 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001630
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001631 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1632 # It might be possible to gain performance by reducing
1633 # the weight buffer size and instead fitting fms in fast storage
1634 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1635
Tim Halld8339a72021-05-27 18:49:40 +01001636 if scheduler_options.verbose_schedule:
1637 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001638
Johan Alfven6e281af2023-02-28 09:03:03 +01001639 # Make a full live range calculation starting from the root sg
1640 _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
1641
Tim Halld8339a72021-05-27 18:49:40 +01001642 # Evaluate schedule
1643 _update_tensor_allocation(nng, arch, options)