Johan Alfven | f349047 | 2023-01-13 08:46:27 +0100 | [diff] [blame] | 1 | # SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com> |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 16 | # |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 17 | # Description: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 18 | # The scheduler creates and searches for an optimal plan for the network, selecting block configurations and |
| 19 | # subdivisions for the Operators |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 20 | # For Class name forward references for the type annotations. (see PEP 563). |
| 21 | from __future__ import annotations |
| 22 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 23 | import copy |
Johan Alfvén | 5e0ae55 | 2022-02-09 21:20:10 +0100 | [diff] [blame] | 24 | from collections import namedtuple |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 25 | from enum import auto |
| 26 | from enum import IntEnum |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 27 | from typing import Any |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 28 | from typing import Dict |
| 29 | from typing import List |
| 30 | from typing import Optional |
| 31 | from typing import Tuple |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 32 | from typing import TYPE_CHECKING |
| 33 | |
| 34 | # Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import. |
| 35 | if TYPE_CHECKING: |
| 36 | from .npu_performance import CycleCost |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 37 | |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 38 | import numpy as np |
| 39 | |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 40 | from . import live_range |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 41 | from . import npu_performance |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 42 | from . import tensor_allocation |
| 43 | from . import weight_compressor |
| 44 | from .architecture_allocator import ArchitectureBlockConfig |
| 45 | from .architecture_allocator import find_block_config |
| 46 | from .architecture_allocator import get_ifm_area_required |
Fredrik Svedberg | d03dc50 | 2022-06-30 10:44:12 +0200 | [diff] [blame] | 47 | from .architecture_allocator import to_upscale |
erik.andersson@arm.com | 8912f3a | 2022-08-16 11:08:57 +0200 | [diff] [blame] | 48 | from .architecture_allocator import is_nearest |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 49 | from .architecture_features import ArchitectureFeatures |
| 50 | from .architecture_features import Block |
| 51 | from .cascade_builder import CascadeBuilder |
| 52 | from .cascade_builder import CascadeInfo |
Fredrik Svedberg | 880e735 | 2020-08-25 11:31:47 +0200 | [diff] [blame] | 53 | from .data_type import DataType |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 54 | from .nn_graph import CascadedPass |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 55 | from .nn_graph import Graph |
| 56 | from .nn_graph import Pass |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 57 | from .nn_graph import PassPlacement |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 58 | from .nn_graph import SchedulingStrategy |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 59 | from .nn_graph import Subgraph |
Johan Alfvén | 92689d5 | 2022-12-06 11:16:19 +0100 | [diff] [blame] | 60 | from .live_range import ofm_can_reuse_ifm |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 61 | from .numeric_util import round_down |
| 62 | from .numeric_util import round_up |
Louis Verhaard | aee5d75 | 2020-09-30 09:01:52 +0200 | [diff] [blame] | 63 | from .operation import Op |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 64 | from .shape4d import Shape4D |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 65 | from .tensor import MemArea |
Patrik Gustavsson | eca2e95 | 2020-05-27 09:15:11 +0200 | [diff] [blame] | 66 | from .tensor import MemType |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 67 | from .tensor import Tensor |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 68 | from .tensor import TensorFormat |
| 69 | from .tensor import TensorPurpose |
| 70 | from .tensor import TensorSubPurpose |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 71 | from .weight_compressor import NpuWeightTensor |
Jacob Bohlin | 1a66697 | 2020-09-11 10:04:15 +0200 | [diff] [blame] | 72 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 73 | |
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Return the storage shape of `shape` for the given tensor format.

    NHCWB16 stores feature maps in 16-channel bricks, so the depth is
    rounded up to the next multiple of 16; every other format keeps the
    shape unchanged.
    """
    if tensor_format != TensorFormat.NHCWB16:
        return shape
    return shape.with_depth(round_up(shape.depth, 16))
| 79 | |
| 80 | |
class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    # Explicit values match what auto() produced from declaration order
    Size = 1
    Performance = 2

    def __str__(self):
        # Print just the member name, e.g. "Size", not "OptimizationStrategy.Size"
        return self.name
| 89 | |
| 90 | |
class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        # Cascade id assigned by CascadeBuilder; 0 means not part of a cascade
        self.cascade = 0
        # Set later by Scheduler.update_op_memory_snapshot
        self.time_index = None
        # OFM depth boundaries; defaults to a single full-depth slice
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Return a fresh SchedulerOpInfo with the same stripes and block config.

        Only the cascade assignment is carried over; all other mutable state
        (weights, cycles, depth slices) is reset to defaults.
        """
        duplicate = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        duplicate.cascade = self.cascade
        return duplicate

    def __str__(self):
        lines = [
            f"\t\tBlock Config = {self.block_config}",
            f"\t\tOFM Block = {self.block_config.ofm_block}",
            f"\t\tIFM Stripe = {self.stripe_input}",
            f"\t\tIFM2 Stripe = {self.stripe_input2}",
            f"\t\tOFM Stripe = {self.stripe}",
            f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes",
        ]
        lines.extend(
            f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes"
            for idx, tens in enumerate(self.buffered_weight_tensors)
        )
        lines.append(f"\t\tDepth slices = {self.ofm_depth_slices}")
        lines.append(f"\t\tAssigned Cascade = {self.cascade}")
        return "\n".join(lines)
| 141 | |
| 142 | |
class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(self, optimization_strategy, sram_target, verbose_schedule):
        # What the scheduler optimizes for: OptimizationStrategy.Size or .Performance
        self.optimization_strategy = optimization_strategy
        # SRAM budget (bytes) that the optimization targets
        self.optimization_sram_limit = sram_target
        # Enables printing of the produced schedule
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__
| 160 | |
| 161 | |
class SchedulerTensor:
    """Scheduler-internal view of a feature map tensor: shape, dtype, placement and format."""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        # Filled in later with the Connection this tensor belongs to
        self.connection = None
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 169 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 170 | |
class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Build the scheduler view of Pass `ps`.

        Caches the primary op's attributes (type, activation, kernel,
        resampling mode), wraps its IFM/IFM2/OFM into SchedulerTensors,
        relocates any LUT inputs into SHRAM, and may swap IFM/IFM2 for
        binary elementwise ops to enable cascading (see below).
        """
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        # Set to True below if IFM and IFM2 get swapped
        self.reversed_operands = False
        # True when a binary op has a scalar operand (shape == [])
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # LUT must be placed in shram area. The copy is done by DMA
        # generated by the high level command stream generator.
        # NOTE: this mutates parent_op.inputs in place, replacing the LUT
        # tensor with its SHRAM clone.
        for idx, tens in enumerate(self.parent_op.inputs):
            if tens.purpose == TensorPurpose.LUT:
                new_tens = tens.clone_into_shram(self.arch)
                new_tens.consumer_list.append(self.parent_op)
                self.parent_op.inputs[idx] = new_tens

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        # Position of this op in Scheduler.sched_ops; assigned by the Scheduler
        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                # Keep the parent Pass consistent with the swap
                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        # No IFM2 present
        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                # IFM2 stripe is clamped to the IFM2 volume as well
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Return the (width, height) of IFM needed for the smallest stripe."""
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        """Find a hardware block config (and SHRAM layout) for the given shapes."""
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )
| 368 | |
| 369 | |
class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        # The underlying graph Tensor this edge represents
        self.parent_tens = tensor
        # SchedulerOperations writing to / reading from this tensor
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__
| 386 | |
| 387 | |
class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        # Per-operation schedule decisions for this solution
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        # Cascade id -> CascadeInfo, filled in by the CascadeBuilder
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        # e.g. "<subgraph>_MAX" for the initial schedule
        return "{}_{}".format(self.sg.name, self.label)
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 402 | |
| 403 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 404 | class Scheduler: |
| 405 | """Main class of the Vela Scheduling""" |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 406 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 407 | def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 408 | self.nng = nng |
| 409 | self.sg = sg |
| 410 | self.arch = arch |
Ayaan Masood | b801dda | 2022-02-22 11:28:55 +0000 | [diff] [blame] | 411 | self.sched_ops: List[SchedulerOperation] = [] |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 412 | self.max_schedule: Optional[Schedule] = None |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 413 | self.scheduler_options = options |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 414 | |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 415 | self.scratched_fms: Dict[Tensor, Any] = {} |
| 416 | self.evicted_fms: List[live_range.LiveRange] = [] |
| 417 | |
Johan Alfvén | 5e0ae55 | 2022-02-09 21:20:10 +0100 | [diff] [blame] | 418 | def avoid_nhcwb16_for_ofm(self, tens, ps, arch): |
Johan Alfven | f349047 | 2023-01-13 08:46:27 +0100 | [diff] [blame] | 419 | """For elementwise ops when ifm is in nhwc format and not brick format aligned (16), |
| 420 | then if the ofm can overwrite the ifm it is better to enforce ofm format to nhwc in |
| 421 | order to reduce memory transactions""" |
Johan Alfvén | 5e0ae55 | 2022-02-09 21:20:10 +0100 | [diff] [blame] | 422 | |
| 423 | op = ps.primary_op |
| 424 | if not op.type.is_elementwise_op(): |
| 425 | return False |
| 426 | |
| 427 | depth = op.ofm_shapes[0][-1] |
| 428 | if (depth % 16) == 0: |
| 429 | return False |
| 430 | |
| 431 | # Check if overwriting the inputs can be allowed |
| 432 | OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"]) |
| 433 | outp = OpShapeTens(op.ofm_shapes[0], op.ofm) |
| 434 | inps = [] |
| 435 | if op.ifm is not None: |
| 436 | inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm)) |
| 437 | if op.ifm2 is not None: |
| 438 | inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2)) |
| 439 | |
| 440 | # Find an input tensor that can be overwritten by the output |
| 441 | for inp in inps: |
| 442 | if ( |
| 443 | # check op input and output shapes allow overlapping |
| 444 | inp.op_shape == outp.op_shape |
| 445 | # check input tensor is valid |
| 446 | and inp.tens is not None |
| 447 | and inp.tens.shape != [] |
| 448 | # check input and output tensors are compatible |
| 449 | and inp.tens.format == outp.tens.format |
| 450 | and inp.tens.dtype == outp.tens.dtype |
| 451 | ): |
| 452 | if inp.tens.format == TensorFormat.NHWC: |
| 453 | return True |
| 454 | |
| 455 | return False |
| 456 | |
    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation

        Walks the subgraph's passes in order, choosing a tensor format for
        each output feature map, wrapping each pass in a SchedulerOperation,
        and wiring Connections between producers and consumers. Also records
        self.min_memory_req, the largest full-featuremap requirement seen.
        """
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    # Subgraph outputs and non-featuremaps must stay linear
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections; one Connection per unique tensor
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req
| 530 | |
| 531 | def create_initial_schedule(self) -> Schedule: |
| 532 | """Creates an initial schedule with no cascading or buffering of any kind""" |
| 533 | schedule = Schedule(self.sg, "MAX") |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 534 | for op in self.sched_ops: |
| 535 | cost = op.create_scheduler_info(self.nng, op.ofm.shape) |
| 536 | cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth) |
| 537 | schedule.cost_map[op] = cost |
| 538 | |
| 539 | return schedule |
| 540 | |
| 541 | def update_op_memory_snapshot(self, schedule: Schedule): |
| 542 | memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))] |
| 543 | |
| 544 | # Collect live ranges from tensors |
| 545 | lr_graph = live_range.LiveRangeGraph() |
| 546 | for mem_area, mem_type_set in memories_list: |
Johan Alfven | 6e281af | 2023-02-28 09:03:03 +0100 | [diff] [blame] | 547 | live_range.extract_live_ranges_from_schedule( |
| 548 | self.sg, |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 549 | mem_area, |
| 550 | mem_type_set, |
| 551 | lr_graph, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 552 | ) |
| 553 | |
| 554 | # Populate time-array with memory used by live ranges |
| 555 | temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area) |
| 556 | schedule.memory_snapshot = temporal_usage |
| 557 | |
| 558 | # Set the peak memory usage |
| 559 | schedule.fast_storage_peak_usage = max(temporal_usage, default=0) |
| 560 | |
| 561 | def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth): |
| 562 | query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) |
| 563 | query.ifm_shape = op.ifm.shape |
| 564 | query.ifm_memory_area = op.ifm.mem_area |
| 565 | query.ifm_bits = op.ifm.dtype.size_in_bits() |
| 566 | query.ifm_format = op.ifm.format |
| 567 | query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| 568 | query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| 569 | query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| 570 | query.ifm2_format = op.ifm2 and op.ifm2.format |
| 571 | query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) |
| 572 | query.ofm_memory_area = op.ofm.mem_area |
| 573 | query.ofm_bits = op.ofm.dtype.size_in_bits() |
| 574 | query.ofm_format = op.ofm.format |
| 575 | if op.parent_op.bias: |
| 576 | query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
| 577 | query.const_memory_area = self.arch.fast_storage_mem_area |
| 578 | |
| 579 | query.kernel = op.kernel |
| 580 | query.config = block_config |
| 581 | |
| 582 | return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query) |
| 583 | |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 584 | def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth): |
| 585 | query = npu_performance.PerformanceQuery(op.op_type.npu_block_type) |
| 586 | query.ifm_shape = op.ifm.shape |
| 587 | query.ifm_memory_area = op.ifm.mem_area |
| 588 | query.ifm_bits = op.ifm.dtype.size_in_bits() |
| 589 | query.ifm_format = op.ifm.format |
| 590 | query.ifm2_shape = op.ifm2 and op.ifm2.shape |
| 591 | query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area |
| 592 | query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits() |
| 593 | query.ifm2_format = op.ifm2 and op.ifm2.format |
| 594 | query.ofm_shape = op.ofm.shape.with_depth(ofm_depth) |
| 595 | query.ofm_memory_area = op.ofm.mem_area |
| 596 | query.ofm_bits = op.ofm.dtype.size_in_bits() |
| 597 | query.ofm_format = op.ofm.format |
| 598 | if op.parent_op.bias: |
| 599 | query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth) |
| 600 | query.const_memory_area = self.arch.fast_storage_mem_area |
| 601 | |
| 602 | query.kernel = op.kernel |
| 603 | query.config = block_config |
| 604 | |
| 605 | return npu_performance.measure_element_access(self.arch, query) |
| 606 | |
Tim Hall | 789e6f3 | 2021-06-17 17:02:31 +0100 | [diff] [blame] | 607 | def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 608 | """Create a buffered schedule""" |
| 609 | buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED") |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 610 | |
| 611 | prev_op = None |
| 612 | for sched_op in self.sched_ops: |
| 613 | if sched_op not in ref_schedule.cost_map: |
| 614 | # sched_op is not part of this sub-schedule - skip |
| 615 | continue |
| 616 | |
| 617 | self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes) |
| 618 | prev_op = sched_op |
| 619 | |
| 620 | return buffered_schedule |
| 621 | |
| 622 | def propose_operator_buffering( |
| 623 | self, |
| 624 | sched_op: SchedulerOperation, |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 625 | prev_op: Optional[SchedulerOperation], |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 626 | buffered_schedule: Schedule, |
| 627 | ref_schedule: Schedule, |
| 628 | staging_limit_bytes, |
| 629 | ): |
| 630 | # Mild recursion might mean this Op has already been seen |
| 631 | if sched_op in buffered_schedule.cost_map: |
| 632 | return |
| 633 | |
| 634 | # Take the reference schedule as default costings for this schedule |
| 635 | ref_cost = ref_schedule.cost_map[sched_op] |
| 636 | cost = copy.copy(ref_cost) |
| 637 | cost.slack_buffering_cycles = ref_cost.cycles.op_cycles |
| 638 | memory_snapshot = ref_schedule.memory_snapshot |
| 639 | ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0 |
| 640 | cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage |
| 641 | buffered_schedule.cost_map[sched_op] = cost |
| 642 | |
| 643 | # Attempt weight buffering on anything with a weights tensor |
| 644 | if sched_op.parent_op.weights: |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 645 | buffer_limit_bytes = cost.slack_buffering_memory |
| 646 | |
| 647 | # If applicable apply size limitation, but keep it within reason (ratio 1.5). |
| 648 | # Size limitation is used when use_fast_storage_for_feature_maps have |
| 649 | # detected that there are fms that do not fit in fast storage. |
| 650 | if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5): |
| 651 | buffer_limit_bytes -= sched_op.evicted_fms_size |
| 652 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 653 | self.propose_weight_buffering( |
| 654 | sched_op.parent_op.weights, |
| 655 | sched_op.parent_op.bias, |
| 656 | sched_op, |
| 657 | prev_op, |
| 658 | buffered_schedule, |
| 659 | ref_schedule, |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 660 | buffer_limit_bytes, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 661 | ) |
| 662 | |
| 663 | return cost |
| 664 | |
| 665 | def weights_needs_dma(self, weight_tensor): |
| 666 | if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast): |
| 667 | # Weights are in permanent storage |
| 668 | # Only when permanent storage differs from feature map storage, there is a point moving the data |
| 669 | if ( |
| 670 | weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash) |
| 671 | and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area |
| 672 | ): |
| 673 | return True |
| 674 | return False |
| 675 | |
    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        """Propose weight buffering for sched_op.

        Encodes the weights (and scales) for the full OFM depth and, when the weights
        need to be DMA'd from permanent storage, iteratively searches for OFM depth
        slices whose encoded weights can be (double-)buffered in fast storage within
        buffer_limit_bytes. Results are written into sched_op's cost entry in
        buffered_schedule: npu_weights_tensor, npu_scales_tensor, ofm_depth_slices,
        buffered_weight_tensors and the updated buffering slack.
        """
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        # Single slice covering the whole stripe depth
        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with limited
            # double buffer buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                # Transfer cannot be fully hidden - prebuffer only the fraction that fits the slack
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    # Halve whichever depth is larger and retry
                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two slices
            # (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two buffers were used.
            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 859 | |
Jacob Bohlin | eee9e5d | 2021-08-17 17:44:45 +0200 | [diff] [blame] | 860 | def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor: |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 861 | buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name) |
Jacob Bohlin | eee9e5d | 2021-08-17 17:44:45 +0200 | [diff] [blame] | 862 | buffered_weight_tensor.src_tensor = src_tensor |
| 863 | buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area |
| 864 | buffered_weight_tensor.mem_type = MemType.Scratch_fast |
| 865 | buffered_weight_tensor.purpose = TensorPurpose.Weights |
| 866 | buffered_weight_tensor.sub_purpose = sub_purpose |
| 867 | return buffered_weight_tensor |
| 868 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 869 | def propose_minimal_schedule(self) -> Schedule: |
| 870 | """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the |
| 871 | next operators stride""" |
| 872 | min_schedule = Schedule(self.sg, "MIN") |
| 873 | cost_map = min_schedule.cost_map |
| 874 | |
| 875 | # Keep track of the previous Op - which consumes the current Op's OFM |
Jonas Ohlsson | 845e232 | 2022-03-01 12:39:55 +0100 | [diff] [blame] | 876 | prev_op: Optional[SchedulerOperation] = None |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 877 | for sched_op in reversed(self.sched_ops): |
| 878 | min_stripe_height = prev_op.kernel.stride.y if prev_op else 1 |
| 879 | min_stripe = sched_op.ofm.shape.with_height(min_stripe_height) |
| 880 | |
| 881 | cost = sched_op.create_scheduler_info(self.nng, min_stripe) |
| 882 | cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth) |
| 883 | cost_map[sched_op] = cost |
| 884 | |
| 885 | prev_op = sched_op |
| 886 | |
| 887 | return min_schedule |
| 888 | |
    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down.

        Walks the operators in reverse, starting from final_stripe for the last Op,
        recosting each Op and back-propagating the stripe that its producer must emit.
        Weight buffers present in ref_schedule are recreated in the new proposal.
        """
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceeding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            # Pad the height so an upscaling producer emits a whole upscale unit
            # (or an even height when a Nearest Neighbor-op is in the cascade)
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            # The producer must supply stride.y input rows per output row
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule
| 941 | |
| 942 | def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict): |
| 943 | """Estimates the memory usage of a schedule""" |
| 944 | cost = schedule.cost_map |
| 945 | cascades = schedule.cascades |
| 946 | peak_mem_usage = 0 |
| 947 | for sched_op in self.sched_ops: |
| 948 | if sched_op not in cost: |
| 949 | # sched_op is not part of the sub-schedule - skip |
| 950 | continue |
| 951 | |
| 952 | if cost[sched_op].cascade: |
| 953 | # This Op is part of a cascade - use the cascade's memory usage |
| 954 | cascade_info = cascades[cost[sched_op].cascade] |
Johan Alfven | 3340a88 | 2023-03-16 11:04:31 +0100 | [diff] [blame^] | 955 | op_mem_usage = cascade_info.mem_usage + non_local_mem_usage.get(sched_op, 0) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 956 | else: |
| 957 | # This Op is not part of a cascade - calculate the memory usage |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 958 | op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 959 | |
| 960 | op_mem_usage = ( |
| 961 | sched_op.ifm_size_in_bytes() |
| 962 | + sched_op.ofm_size_in_bytes() |
| 963 | + op_weight_buffer |
| 964 | + non_local_mem_usage.get(sched_op, 0) |
| 965 | ) |
Johan Alfven | 3340a88 | 2023-03-16 11:04:31 +0100 | [diff] [blame^] | 966 | peak_mem_usage = max(op_mem_usage, peak_mem_usage) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 967 | |
| 968 | return peak_mem_usage |
| 969 | |
Johan Alfvén | 255dad7 | 2022-07-16 18:27:05 +0200 | [diff] [blame] | 970 | def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int): |
| 971 | # Update memory snapshot |
| 972 | self.sg.schedule = min_schedule |
| 973 | self.update_op_memory_snapshot(min_schedule) |
| 974 | |
| 975 | # Calculate residual memory for Min schedule |
| 976 | non_local_mem_usage = {} |
| 977 | for sched_op in self.sched_ops: |
| 978 | time_index = min_schedule.cost_map[sched_op].time_index |
| 979 | |
| 980 | if self.arch.is_spilling_enabled(): |
| 981 | # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0 |
| 982 | op_mem_usage = 0 |
| 983 | else: |
| 984 | # Min schedule only have ifm and ofm in SRAM (no buffered weigth tensors) |
Johan Alfvén | eb332a3 | 2022-12-09 17:50:48 +0100 | [diff] [blame] | 985 | # Only include IFM's that are in the scratch area |
| 986 | ifm = sched_op.ifm.connection.parent_tens |
| 987 | ifm_size = ( |
| 988 | 0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes() |
| 989 | ) |
Johan Alfvén | 92689d5 | 2022-12-06 11:16:19 +0100 | [diff] [blame] | 990 | ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes() |
Johan Alfvén | eb332a3 | 2022-12-09 17:50:48 +0100 | [diff] [blame] | 991 | op_mem_usage = ifm_size + ofm_size |
Johan Alfvén | 255dad7 | 2022-07-16 18:27:05 +0200 | [diff] [blame] | 992 | |
| 993 | non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage |
Johan Alfvén | 92689d5 | 2022-12-06 11:16:19 +0100 | [diff] [blame] | 994 | assert non_local_mem_usage[sched_op] >= 0 |
Johan Alfvén | 255dad7 | 2022-07-16 18:27:05 +0200 | [diff] [blame] | 995 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 996 | # Create cascades for Min schedule |
Johan Alfvén | 255dad7 | 2022-07-16 18:27:05 +0200 | [diff] [blame] | 997 | cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage) |
| 998 | cascade_builder.build_cascades(min_schedule, max_template, memory_limit) |
| 999 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1000 | def optimize_sub_schedule( |
| 1001 | self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int |
| 1002 | ) -> Schedule: |
| 1003 | """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by |
| 1004 | proposing weight buffering and then continously proposing new stripe sizes""" |
| 1005 | ref_cost = ref_schedule.cost_map |
| 1006 | # Extract the ops that are part of this sub-schedule |
| 1007 | start = cascade_info.start |
| 1008 | end = cascade_info.end |
| 1009 | sub_schedule_ops = self.sched_ops[start : end + 1] |
| 1010 | # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule |
| 1011 | sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}") |
| 1012 | for sched_op in sub_schedule_ops: |
| 1013 | sub_schedule.cost_map[sched_op] = ref_cost[sched_op] |
| 1014 | |
| 1015 | sub_schedule.cascades[end] = cascade_info |
| 1016 | # Use the memory snapshot from the reference schedule |
| 1017 | sub_schedule.memory_snapshot = ref_schedule.memory_snapshot |
| 1018 | |
| 1019 | # Calculate memory usage that is live during the sub-schedule but not part of it |
| 1020 | time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index |
| 1021 | mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage |
| 1022 | # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's |
Johan Alfven | 3340a88 | 2023-03-16 11:04:31 +0100 | [diff] [blame^] | 1023 | # included in a cascade or not. Not valid in Dedicated SRAM mode (spilling enabled). |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1024 | persistent_initial_ifm = ( |
Johan Alfven | 3340a88 | 2023-03-16 11:04:31 +0100 | [diff] [blame^] | 1025 | sub_schedule_ops[0].ifm_size_in_bytes() |
| 1026 | if not self.arch.is_spilling_enabled() and len(sub_schedule_ops[0].ifm.connection.consumers) > 1 |
| 1027 | else 0 |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1028 | ) |
| 1029 | # Calculate non-local-mem-usage per Operator |
| 1030 | non_local_mem_usage = {} |
| 1031 | for idx, sched_op in enumerate(sub_schedule_ops): |
| 1032 | non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule |
| 1033 | if idx != 0: |
| 1034 | non_local_mem_usage[sched_op] += persistent_initial_ifm |
| 1035 | |
| 1036 | cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage) |
| 1037 | |
| 1038 | # Start by adding buffering |
Tim Hall | 789e6f3 | 2021-06-17 17:02:31 +0100 | [diff] [blame] | 1039 | buffered_sub_schedule = self.propose_schedule_buffering( |
| 1040 | sub_schedule, self.scheduler_options.optimization_sram_limit |
| 1041 | ) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1042 | # Copy the cascades over from the unbuffered-schedule |
| 1043 | buffered_sub_schedule.cascades = sub_schedule.cascades |
| 1044 | |
| 1045 | # Generate the possible stripings for the final Op in the sub-schedule |
| 1046 | final_ofm_shape = sub_schedule_ops[-1].ofm.shape |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1047 | |
| 1048 | # Skip testing the min stripe used in the MIN schedule since that will be used |
| 1049 | # anyway if no new cascades are created below |
| 1050 | last_op = sub_schedule_ops[-1] |
| 1051 | min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1 |
| 1052 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1053 | possible_stripes = [ |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1054 | final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1055 | ] |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1056 | # Propose different striping |
Jacob Bohlin | fad7204 | 2021-08-24 21:51:41 +0200 | [diff] [blame] | 1057 | best_schedule = None |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1058 | max_nbr_of_cascades = 0 |
| 1059 | for iteration, proposed_stripe in enumerate(possible_stripes): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1060 | proposed_schedule = self.propose_schedule_striping( |
| 1061 | proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule |
| 1062 | ) |
| 1063 | |
| 1064 | cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit) |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1065 | nbr_of_cascades = len(proposed_schedule.cascades) |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1066 | if iteration == 0: |
| 1067 | # First iteration - used as limit to prevent splitting up the cascades |
| 1068 | # Long cascades are better in order to reduce IFM/IFM dram bandwidth |
| 1069 | max_nbr_of_cascades = nbr_of_cascades |
| 1070 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1071 | # Check if proposal fits |
| 1072 | proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage) |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1073 | if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1074 | best_schedule = proposed_schedule |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1075 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1076 | if not proposed_schedule.cascades: |
| 1077 | # No cascading required - early exit |
| 1078 | break |
| 1079 | else: |
Johan Alfvén | 2a285fc | 2022-08-17 14:59:58 +0200 | [diff] [blame] | 1080 | break |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1081 | |
| 1082 | return best_schedule |
| 1083 | |
| 1084 | def optimize_schedule( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 1085 | self, |
| 1086 | schedule: Schedule, |
| 1087 | max_sched: Schedule, |
| 1088 | max_template: Schedule, |
| 1089 | options: SchedulerOptions, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1090 | ) -> Schedule: |
| 1091 | """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule""" |
| 1092 | sram_limit = options.optimization_sram_limit |
| 1093 | if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled(): |
| 1094 | # Maximum performance schedule fits within the SRAM target |
| 1095 | return max_sched |
| 1096 | |
Jacob Bohlin | fad7204 | 2021-08-24 21:51:41 +0200 | [diff] [blame] | 1097 | # Iterate over a copy of the cascades since they may change during the loop |
| 1098 | for cascade_info in list(schedule.cascades.values()): |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1099 | # Optimize the sub-schedule in this cascade |
| 1100 | opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit) |
Jacob Bohlin | fad7204 | 2021-08-24 21:51:41 +0200 | [diff] [blame] | 1101 | if opt_sub_schedule: |
| 1102 | # Remove the existing cascade |
| 1103 | del schedule.cascades[cascade_info.end] |
| 1104 | # Update the sub-schedule Op and cascade costs to the full schedule |
| 1105 | schedule.cost_map.update(opt_sub_schedule.cost_map) |
| 1106 | schedule.cascades.update(opt_sub_schedule.cascades) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1107 | |
| 1108 | # Update memory snapshot |
| 1109 | self.sg.schedule = schedule |
| 1110 | self.update_op_memory_snapshot(schedule) |
| 1111 | # Propose schedule buffering to the optimized schedule |
Tim Hall | 789e6f3 | 2021-06-17 17:02:31 +0100 | [diff] [blame] | 1112 | optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1113 | # Copy the cascade's metadata from the unbuffered schedule |
| 1114 | optimized_sched.cascades = schedule.cascades |
| 1115 | return optimized_sched |
| 1116 | |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1117 | def optimize_weight_buffering_size( |
| 1118 | self, |
| 1119 | min_schedule: Schedule, |
| 1120 | options: SchedulerOptions, |
| 1121 | ): |
| 1122 | default_schedule = self.sg.schedule |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 1123 | npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False) |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1124 | default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total] |
| 1125 | default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess] |
| 1126 | |
| 1127 | # Restore mem/type for scratched_fms |
| 1128 | for tens in self.scratched_fms: |
| 1129 | tens.mem_area = self.scratched_fms[tens][0] |
| 1130 | tens.mem_type = self.scratched_fms[tens][1] |
| 1131 | |
| 1132 | self.update_op_memory_snapshot(self.sg.schedule) |
| 1133 | |
| 1134 | # Collect live ranges from tensors |
| 1135 | memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))] |
| 1136 | lr_graph = live_range.LiveRangeGraph() |
| 1137 | for mem_area, mem_type_set in memories_list: |
Johan Alfven | 6e281af | 2023-02-28 09:03:03 +0100 | [diff] [blame] | 1138 | live_range.extract_live_ranges_from_schedule( |
| 1139 | self.sg, |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1140 | mem_area, |
| 1141 | mem_type_set, |
| 1142 | lr_graph, |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1143 | ) |
| 1144 | |
| 1145 | # Find the relation between the sched_op and the buffering tensor |
| 1146 | weight_ops = {} |
| 1147 | for sched_op in self.sched_ops: |
| 1148 | cost = self.sg.schedule.cost_map[sched_op] |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 1149 | for tens in cost.buffered_weight_tensors: |
| 1150 | weight_ops[tens] = sched_op |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1151 | |
| 1152 | # Filter out weight buffer live ranges |
| 1153 | weight_lrs = [] |
| 1154 | for lr in lr_graph.lrs: |
| 1155 | for tens in lr.tensors: |
| 1156 | if weight_ops.get(tens): |
| 1157 | weight_lrs.append(lr) |
| 1158 | break |
| 1159 | |
| 1160 | # See if any evicted fm overlaps with a weight buffering op. |
| 1161 | # If this is the case add a size limitation to the buffering op |
| 1162 | for lr in self.evicted_fms: |
| 1163 | for weight_lr in weight_lrs: |
| 1164 | if lr.overlaps_ranges(weight_lr): |
| 1165 | for tens in weight_lr.tensors: |
| 1166 | sched_op = weight_ops.get(tens) |
| 1167 | if sched_op: |
| 1168 | # Add size reduction to the op |
| 1169 | sched_op.evicted_fms_size += lr.size |
| 1170 | break |
| 1171 | |
| 1172 | self.sg.schedule = min_schedule |
| 1173 | self.update_op_memory_snapshot(self.sg.schedule) |
| 1174 | |
| 1175 | # Run schedule buffering - with weight buffer size reduction |
| 1176 | schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit) |
| 1177 | schedule.cascades = self.sg.schedule.cascades |
| 1178 | self.sg.schedule = schedule |
| 1179 | |
| 1180 | # Apply new buffer schdule and calc new performance |
| 1181 | self.update_op_memory_snapshot(self.sg.schedule) |
| 1182 | self.apply_schedule(self.sg.schedule) |
| 1183 | self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit) |
| 1184 | |
Tim Hall | c1be087 | 2022-03-03 17:50:52 +0000 | [diff] [blame] | 1185 | npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False) |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1186 | new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total] |
| 1187 | new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess] |
| 1188 | |
Tim Hall | 8bc7a65 | 2022-05-19 15:29:23 +0100 | [diff] [blame] | 1189 | improvement_tot = ( |
| 1190 | round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0 |
| 1191 | ) |
| 1192 | improvement_dram = ( |
| 1193 | round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0 |
| 1194 | ) |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1195 | |
| 1196 | # Compare both total and dram improvement |
Johan Alfvén | 3dae1b6 | 2022-05-17 10:26:48 +0200 | [diff] [blame] | 1197 | if not (improvement_tot >= 0 and improvement_dram > 0): |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1198 | # No improvement, restore the default schedule |
| 1199 | for sched_op in self.sched_ops: |
| 1200 | sched_op.evicted_fms_size = 0 |
| 1201 | |
| 1202 | for tens in self.scratched_fms: |
| 1203 | tens.mem_area = self.scratched_fms[tens][0] |
| 1204 | tens.mem_type = self.scratched_fms[tens][1] |
| 1205 | |
| 1206 | self.sg.schedule = default_schedule |
| 1207 | self.update_op_memory_snapshot(self.sg.schedule) |
| 1208 | self.apply_schedule(self.sg.schedule) |
| 1209 | self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit) |
| 1210 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1211 | def apply_schedule(self, sched: Schedule): |
| 1212 | """Applies the given schedule as a final solution""" |
| 1213 | for sched_op in self.sched_ops: |
| 1214 | op_info = sched.cost_map[sched_op] |
| 1215 | cascade_info = sched.cascades.get(op_info.cascade, None) |
| 1216 | if cascade_info and sched_op in cascade_info.buffers: |
| 1217 | buffer_tens = sched_op.ifm.connection.parent_tens |
| 1218 | # Apply memory area and type |
| 1219 | buffer_tens.mem_area = self.arch.fast_storage_mem_area |
| 1220 | buffer_tens.mem_type = MemType.Scratch_fast |
| 1221 | # Apply Rolling buffer |
| 1222 | buffer_tens.set_format(TensorFormat.NHCWB16, self.arch) |
| 1223 | buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height) |
| 1224 | |
| 1225 | sched_op.parent_ps.block_config = op_info.block_config.old_style_representation() |
| 1226 | |
| 1227 | # Ensure that the src_tensor reference is set correctly |
Rickard Bolin | fd8b500 | 2022-05-16 09:11:06 +0000 | [diff] [blame] | 1228 | for tens in op_info.buffered_weight_tensors: |
| 1229 | tens.src_tensor = op_info.npu_weights_tensor |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1230 | |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1231 | def use_fast_storage_for_feature_maps(self, schedule, staging_limit): |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1232 | """Finds the set of feature maps that fits within the staging limit which combined has the largest amount of |
| 1233 | access cycles and moves those feature map into fast storage""" |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1234 | max_mem_usage = [] |
| 1235 | base_mem_usage = [] |
| 1236 | fast_storage_type = MemType.Scratch_fast |
| 1237 | fast_storage_mem_area = self.arch.fast_storage_mem_area |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1238 | self.evicted_fms = [] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1239 | |
| 1240 | # Force all OFMs to fast-storage |
| 1241 | for sched_op in self.sched_ops: |
| 1242 | cost = schedule.cost_map[sched_op] |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1243 | if cost.cascade == 0 and sched_op.get_dependants(): |
| 1244 | ofm_tens = sched_op.ofm.connection.parent_tens |
| 1245 | if not any(cons is None for cons in ofm_tens.consumer_list): |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1246 | if ofm_tens not in self.scratched_fms: |
| 1247 | # Remember default mem area and mem type, only done once |
| 1248 | self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type) |
| 1249 | |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1250 | ofm_tens.mem_area = fast_storage_mem_area |
| 1251 | ofm_tens.mem_type = fast_storage_type |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1252 | |
| 1253 | # Collect live ranges from tensors |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1254 | memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1255 | lr_graph = live_range.LiveRangeGraph() |
| 1256 | for mem_area, mem_type_set in memories_list: |
Johan Alfven | 6e281af | 2023-02-28 09:03:03 +0100 | [diff] [blame] | 1257 | live_range.extract_live_ranges_from_schedule( |
| 1258 | self.sg, |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 1259 | mem_area, |
| 1260 | mem_type_set, |
| 1261 | lr_graph, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1262 | ) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1263 | max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1264 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1265 | # If max_mem_usage does not exceed staging limit at any point all lrs fit and can stay in fast storage |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1266 | if max(max_mem_usage) <= staging_limit: |
| 1267 | return |
| 1268 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1269 | # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1270 | base_mem_usage = np.array(max_mem_usage) |
| 1271 | curr_lrs = [] |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1272 | for lr in lr_graph.lrs: |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1273 | for tens in lr.tensors: |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1274 | if self.scratched_fms.get(tens): |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1275 | curr_lrs.append(lr) |
| 1276 | base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size |
| 1277 | break |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1278 | competing_lrs = [] |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1279 | competing_tens_access = {} |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1280 | |
| 1281 | # Evict live ranges that will never fit |
| 1282 | for lr in curr_lrs.copy(): |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1283 | base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1]) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1284 | if base_usage + lr.size > staging_limit: |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1285 | # Lr will never fit and may thus be evicted |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1286 | self.evicted_fms.append(lr) |
| 1287 | FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms) |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1288 | curr_lrs.remove(lr) |
| 1289 | |
| 1290 | # Keep live ranges that will always fit in fast storage and let the remaining ones compete |
| 1291 | for lr in curr_lrs: |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1292 | # Since max_mem_usage is the memory usage with all FMs still in fast-storage, |
| 1293 | # the memory limit cannot be exceeded if max_mem_usage does not. |
| 1294 | # Thus, the affected lrs can remain in fast-storage if the following is true |
| 1295 | if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit: |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1296 | FastStorageComponentAllocator.keep(lr, base_mem_usage) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1297 | else: |
| 1298 | competing_lrs.append(lr) |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1299 | for tens in lr.tensors: |
| 1300 | competing_tens_access[tens] = 0 |
| 1301 | |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1302 | # All lrs and their tensors have been handled if competing_lrs_sz is zero, we may thus return |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1303 | if len(competing_lrs) == 0: |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1304 | return |
| 1305 | |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1306 | # Estimate element access for all tensors that are competing for a place in fast-storage. |
| 1307 | # This number is used when deciding which tensor that stays in fast-storage |
| 1308 | for sched_op in self.sched_ops: |
| 1309 | cost = schedule.cost_map[sched_op] |
| 1310 | |
| 1311 | if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None: |
| 1312 | tens = sched_op.ifm.connection.parent_tens |
| 1313 | access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth) |
| 1314 | competing_tens_access[tens] += access.ifm_read[0] |
| 1315 | |
| 1316 | if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None: |
| 1317 | tens = sched_op.ifm2.connection.parent_tens |
| 1318 | access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth) |
| 1319 | competing_tens_access[tens] += access.ifm_read[1] |
| 1320 | |
| 1321 | if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None: |
| 1322 | tens = sched_op.ofm.connection.parent_tens |
| 1323 | access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth) |
| 1324 | competing_tens_access[tens] += access.ofm_write |
| 1325 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1326 | # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1327 | competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size)) |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1328 | |
| 1329 | # Remove lrs that have a live range that is too long compared to others. |
| 1330 | # They are causing problems for the HillClimb Allocator when it has to |
| 1331 | # change the allocation indices, in order to fit all the allocations into SRAM. |
| 1332 | # This problem only occur in larger networks with complex graphs. |
| 1333 | # |
| 1334 | # Limit the number of items for allocate_component to work with max MAX_EXHAUSTIVE_ITEMS |
| 1335 | # at the time. Too many will give too long compilation time |
| 1336 | # |
| 1337 | # Too long is currently decided to be (based on experience, analyzing many networks): |
| 1338 | # Compare lr at postion i with lr at position i + MAX_EXHAUSTIVE_ITEMS. |
| 1339 | # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i. |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1340 | if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS: |
| 1341 | # Create a copy of the original list to iterate over because the original version is modified in-loop |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1342 | competing_lrs_copy = competing_lrs.copy() |
| 1343 | for i, lr in enumerate(competing_lrs_copy): |
| 1344 | lr_time = lr.end_time - lr.start_time |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1345 | # Only check live ranges longer than MAX_EXHAUSTIVE_LIFE_RANGE |
| 1346 | if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE: |
| 1347 | # Compare current lr with lr at position lr + MAX_EXHAUSTIVE_ITEMS |
| 1348 | cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1) |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1349 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1350 | # Compare end times + plus a margin by MAX_EXHAUSTIVE_LIFE_RANGE |
| 1351 | if ( |
| 1352 | lr.end_time |
| 1353 | > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE |
| 1354 | ): |
| 1355 | # Current lr live time stands out, remove it. No use adding it to the |
| 1356 | # evicted_fms list since the lr should not be included in the fast storage allocation |
| 1357 | FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms) |
| 1358 | competing_lrs.remove(lr) |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1359 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1360 | # Split competing live ranges into components by finding disconnected groups of live ranges or components of |
| 1361 | # max size MAX_EXHAUSTIVE_ITEMS |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1362 | start = 0 |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1363 | end_time = competing_lrs[0].end_time |
| 1364 | component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit) |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1365 | component_ranges = [] |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1366 | for i, lr in enumerate(competing_lrs): |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1367 | nbr_items = i - start |
| 1368 | if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS): |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1369 | end_time = max(end_time, lr.end_time) |
| 1370 | else: |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1371 | # Number items reached max items or current lr's start time |
| 1372 | # does not overlap with previous lr's end time |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1373 | component_ranges.append((start, i)) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1374 | start = i |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1375 | end_time = lr.end_time |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1376 | component_ranges.append((start, len(competing_lrs))) |
| 1377 | |
| 1378 | # Allocate each component separately |
| 1379 | for start, end in component_ranges: |
| 1380 | component_allocator.allocate_component( |
| 1381 | competing_lrs[start:end], |
| 1382 | max_mem_usage, |
| 1383 | base_mem_usage, |
| 1384 | self.scratched_fms, |
| 1385 | competing_tens_access, |
| 1386 | self.evicted_fms, |
| 1387 | ) |
| 1388 | assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit" |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1389 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1390 | def print_schedule(self, schedule: Schedule): |
| 1391 | print(f"Schedule: '{schedule.name}'") |
| 1392 | for sched_op in self.sched_ops: |
| 1393 | if sched_op not in schedule.cost_map: |
| 1394 | # Sub-schedule printing |
| 1395 | continue |
| 1396 | |
| 1397 | op_info = schedule.cost_map[sched_op] |
| 1398 | print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}") |
| 1399 | print(f"\t\tType: {sched_op.op_type}") |
| 1400 | print(f"\t\tKernel: {sched_op.kernel}") |
| 1401 | print(f"{op_info}") |
| 1402 | mem_usage = ( |
| 1403 | schedule.memory_snapshot[op_info.time_index] |
| 1404 | if op_info.time_index < len(schedule.memory_snapshot) |
| 1405 | else 0 |
| 1406 | ) |
| 1407 | print(f"\t\tSRAM Used: {mem_usage} bytes") |
| 1408 | |
Jonas Ohlsson | 25e700c | 2022-03-04 14:58:56 +0100 | [diff] [blame] | 1409 | print("\tCascades:") |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1410 | for i, cascade in enumerate(schedule.cascades.values()): |
| 1411 | print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}") |
Patrik Gustavsson | feeb06d | 2020-04-22 12:53:47 +0200 | [diff] [blame] | 1412 | |
Andreas Nevalainen | 27d36f0 | 2020-11-19 11:27:50 +0100 | [diff] [blame] | 1413 | |
Johan Alfven | 6e281af | 2023-02-28 09:03:03 +0100 | [diff] [blame] | 1414 | def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers): |
| 1415 | mem_area = arch.fast_storage_mem_area |
| 1416 | mem_type_set = set((MemType.Scratch, MemType.Scratch_fast)) |
| 1417 | |
| 1418 | # Collect live ranges for the full graph |
| 1419 | # extract_live_ranges_from_cascaded_passes will start from the root sg and |
| 1420 | # all sub graphs/cascaded passes will be visited and the correct time_index |
| 1421 | # will be set for all the tensors. |
| 1422 | lr_graph = live_range.LiveRangeGraph() |
| 1423 | live_range.extract_live_ranges_from_cascaded_passes( |
| 1424 | nng.get_root_subgraph(), |
| 1425 | mem_area, |
| 1426 | mem_type_set, |
| 1427 | lr_graph, |
| 1428 | Tensor.AllocationQuantum, |
| 1429 | ) |
| 1430 | # Populate time-array with memory used by live ranges |
| 1431 | temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area) |
| 1432 | |
| 1433 | # Update snapshot for all the npu sub graphs |
| 1434 | # Not needed for the scheduler any longer but npu_performance |
| 1435 | # is using this information so it must have the correct state |
| 1436 | for sg in schedulers: |
| 1437 | sg.schedule.memory_snapshot = temporal_usage |
| 1438 | sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0) |
| 1439 | |
| 1440 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1441 | def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options): |
| 1442 | """ |
| 1443 | Creates live ranges and runs tensor allocator for the current schedule |
| 1444 | (i.e. sg.schedule for all subgraphs), returns the maximum memory usage |
| 1445 | and updates SchedulerOpInfo.mem_usage for all operations in the schedule. |
| 1446 | """ |
| 1447 | root_sg = nng.get_root_subgraph() |
| 1448 | |
| 1449 | alloc_list = [] |
| 1450 | if arch.is_spilling_enabled(): |
| 1451 | mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,))) |
| 1452 | mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,))) |
| 1453 | # Order is important |
| 1454 | alloc_list.append(mem_alloc_scratch_fast) |
| 1455 | alloc_list.append(mem_alloc_scratch) |
| 1456 | else: |
| 1457 | mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast))) |
| 1458 | alloc_list.append(mem_alloc_scratch) |
| 1459 | |
| 1460 | for mem_area, mem_type_set in alloc_list: |
| 1461 | tensor_allocation.allocate_tensors( |
| 1462 | nng, |
| 1463 | root_sg, |
| 1464 | arch, |
| 1465 | mem_area, |
| 1466 | mem_type_set, |
| 1467 | tensor_allocator=options.tensor_allocator, |
| 1468 | verbose_allocation=options.verbose_allocation, |
| 1469 | cpu_tensor_alignment=options.cpu_tensor_alignment, |
Tim Hall | cda4fcb | 2022-05-19 12:36:58 +0100 | [diff] [blame] | 1470 | hillclimb_max_iterations=options.hillclimb_max_iterations, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1471 | ) |
| 1472 | |
| 1473 | |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1474 | class FastStorageComponentAllocator: |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1475 | MAX_EXHAUSTIVE_LIFE_RANGE = 20 |
Johan Alfvén | 3a6325f | 2022-10-07 18:03:48 +0200 | [diff] [blame] | 1476 | MAX_EXHAUSTIVE_ITEMS = 20 |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1477 | |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1478 | def __init__(self, base_mem_usage, max_mem_usage, staging_limit): |
| 1479 | self.base_mem_usage = base_mem_usage |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1480 | self.max_mem_usage = max_mem_usage |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1481 | self.staging_limit = staging_limit |
| 1482 | self.lrs = [] |
| 1483 | self.evicted = [] |
| 1484 | self.curr_evicted = [] |
| 1485 | self.remaining_total_size = [] |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1486 | self.best_score = 0 |
| 1487 | self.competing_tens_access = {} |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1488 | |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1489 | def allocate_exhaustive(self, ix, score): |
| 1490 | # Favour tensors with highest element access (score) |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1491 | if ix >= self.num_lrs: |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1492 | if score > self.best_score: |
| 1493 | self.best_score = score |
Louis Verhaard | 5c8f1e5 | 2022-02-23 14:13:07 +0100 | [diff] [blame] | 1494 | self.evicted = self.curr_evicted.copy() |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1495 | return |
| 1496 | |
| 1497 | lr = self.lrs[ix] |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1498 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1499 | # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time |
| 1500 | # range, it can fit and the case where the tensor is included needs to be checked |
| 1501 | can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1502 | if can_fit: |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1503 | # Tensor can fit, add tensor element access to the score and check case where tensor is included |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1504 | self.curr_evicted[ix] = False |
| 1505 | self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True) |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1506 | self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]]) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1507 | self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False) |
| 1508 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1509 | # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the |
| 1510 | # case where the tensor is not included can be skipped |
| 1511 | always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1512 | if not always_fits: |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1513 | # Tensor doesn't always fit, check case when tensor is not included |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1514 | self.curr_evicted[ix] = True |
| 1515 | self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False) |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1516 | self.allocate_exhaustive(ix + 1, score) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1517 | self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True) |
| 1518 | |
| 1519 | @staticmethod |
| 1520 | def update_mem_usage(mem_usage, lr, increase): |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1521 | size = lr.size if increase else -lr.size |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1522 | for t in range(lr.start_time, lr.end_time + 1): |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1523 | mem_usage[t] += size |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1524 | return mem_usage |
| 1525 | |
| 1526 | @staticmethod |
| 1527 | def evict(lr, max_mem_usage, scratched_fms): |
| 1528 | for t in range(lr.start_time, lr.end_time + 1): |
| 1529 | max_mem_usage[t] -= lr.size |
| 1530 | for tens in lr.tensors: |
| 1531 | if tens in scratched_fms: |
| 1532 | tens.mem_area = scratched_fms[tens][0] |
| 1533 | tens.mem_type = scratched_fms[tens][1] |
| 1534 | |
| 1535 | @staticmethod |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1536 | def keep(lr, base_mem_usage): |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1537 | for t in range(lr.start_time, lr.end_time + 1): |
| 1538 | base_mem_usage[t] += lr.size |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1539 | |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1540 | def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms): |
| 1541 | self.lrs = lrs |
| 1542 | self.num_lrs = len(lrs) |
| 1543 | self.evicted = [0] * self.num_lrs |
| 1544 | self.curr_evicted = [0] * self.num_lrs |
| 1545 | self.best_score = -1 |
| 1546 | self.competing_tens_access = competing_tens_access |
Johan Alfvén | 5c30971 | 2022-06-10 15:40:58 +0200 | [diff] [blame] | 1547 | # Recursively evaluate all permutations of allocations of the lrs found in the component. |
| 1548 | # For every permutation that fits within the staging_limit there is a score calculated. |
| 1549 | # The permutation with the highest score will then be chosen. The score is calculated |
| 1550 | # as the sum of the actual element access (ifm read and ofm write) for all the |
| 1551 | # including tensors. So it is not necessary the tensor with the biggest size that ends up |
| 1552 | # being included in the result. |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1553 | self.allocate_exhaustive(0, 0) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1554 | # Optimal allocation has been found, move lrs accordingly |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1555 | for i, lr in enumerate(self.lrs): |
| 1556 | if self.evicted[i]: |
| 1557 | self.evict(lr, max_mem, scratched_fms) |
| 1558 | if lr not in evicted_fms: |
| 1559 | evicted_fms.append(lr) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1560 | else: |
Rickard Bolin | 1bd20f2 | 2022-12-07 08:54:53 +0000 | [diff] [blame] | 1561 | self.keep(lr, min_mem) |
| 1562 | if lr in evicted_fms: |
| 1563 | evicted_fms.remove(lr) |
erik.andersson@arm.com | de6cb64 | 2022-02-02 14:03:15 +0100 | [diff] [blame] | 1564 | |
| 1565 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1566 | def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions): |
| 1567 | """Entry point for the Scheduler""" |
| 1568 | # Initialize CPU subgraphs |
| 1569 | schedulers = dict() |
| 1570 | # Initialize schedulers with max schedule. Only schedule NPU subgraphs |
Andreas Nevalainen | 27d36f0 | 2020-11-19 11:27:50 +0100 | [diff] [blame] | 1571 | for sg in nng.subgraphs: |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1572 | if sg.placement != PassPlacement.Npu: |
| 1573 | # Create cascaded passes for CPU Ops |
| 1574 | cascaded_passes = [] |
| 1575 | for idx, ps in enumerate(sg.passes): |
| 1576 | cps = CascadedPass( |
Jonas Ohlsson | d857507 | 2022-03-30 10:30:25 +0200 | [diff] [blame] | 1577 | ps.name, |
| 1578 | SchedulingStrategy.WeightStream, |
| 1579 | ps.inputs, |
| 1580 | [], |
| 1581 | ps.outputs, |
| 1582 | [ps], |
| 1583 | ps.placement, |
| 1584 | False, |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1585 | ) |
Andreas Nevalainen | 27d36f0 | 2020-11-19 11:27:50 +0100 | [diff] [blame] | 1586 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1587 | cps.time = idx |
| 1588 | ps.cascade = cps |
| 1589 | cascaded_passes.append(cps) |
Andreas Nevalainen | 27d36f0 | 2020-11-19 11:27:50 +0100 | [diff] [blame] | 1590 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1591 | sg.cascaded_passes = cascaded_passes |
| 1592 | else: |
| 1593 | # Npu subgraph - create schedule |
| 1594 | scheduler = Scheduler(nng, sg, arch, scheduler_options) |
| 1595 | schedulers[sg] = scheduler |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1596 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1597 | scheduler.create_scheduler_representation(arch) |
| 1598 | sg.sched_ops = scheduler.sched_ops |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1599 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1600 | # Create the Max schedule template |
| 1601 | max_schedule_template = scheduler.create_initial_schedule() |
| 1602 | scheduler.max_schedule = max_schedule_template |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1603 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1604 | # Create the optimimised Max schedule |
| 1605 | sg.schedule = max_schedule_template |
| 1606 | scheduler.update_op_memory_snapshot(max_schedule_template) |
Tim Hall | 789e6f3 | 2021-06-17 17:02:31 +0100 | [diff] [blame] | 1607 | opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1608 | sg.schedule = opt_max_schedule |
| 1609 | scheduler.update_op_memory_snapshot(opt_max_schedule) |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1610 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1611 | # Create Min schedule |
| 1612 | min_schedule = scheduler.propose_minimal_schedule() |
| 1613 | initial_sram_limit = scheduler_options.optimization_sram_limit |
| 1614 | if scheduler_options.optimization_strategy == OptimizationStrategy.Size: |
| 1615 | initial_sram_limit = scheduler.min_memory_req |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1616 | |
Johan Alfvén | 255dad7 | 2022-07-16 18:27:05 +0200 | [diff] [blame] | 1617 | # Build cascades for Min schedule |
| 1618 | scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit) |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1619 | sg.schedule = min_schedule |
| 1620 | scheduler.update_op_memory_snapshot(min_schedule) |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1621 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1622 | if scheduler_options.optimization_strategy == OptimizationStrategy.Performance: |
| 1623 | # Create an optimized schedule |
| 1624 | sg.schedule = scheduler.optimize_schedule( |
| 1625 | min_schedule, opt_max_schedule, max_schedule_template, scheduler_options |
| 1626 | ) |
| 1627 | scheduler.update_op_memory_snapshot(sg.schedule) |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1628 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1629 | scheduler.apply_schedule(sg.schedule) |
| 1630 | scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit) |
Andreas Nevalainen | 897cc14 | 2020-10-28 15:42:08 +0100 | [diff] [blame] | 1631 | |
Johan Alfvén | 6f4cb03 | 2022-05-05 08:42:46 +0200 | [diff] [blame] | 1632 | if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms: |
| 1633 | # It might be possible to gain performance by reducing |
| 1634 | # weight buffer size and instead fit fms in fast storage |
| 1635 | scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options) |
| 1636 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1637 | if scheduler_options.verbose_schedule: |
| 1638 | scheduler.print_schedule(sg.schedule) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1639 | |
Johan Alfven | 6e281af | 2023-02-28 09:03:03 +0100 | [diff] [blame] | 1640 | # Make a full live range calculation starting from the root sg |
| 1641 | _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers) |
| 1642 | |
Tim Hall | d8339a7 | 2021-05-27 18:49:40 +0100 | [diff] [blame] | 1643 | # Evaluate schedule |
| 1644 | _update_tensor_allocation(nng, arch, options) |