# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor

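# NHCWB16 stores the depth axis in 16-element bricks, so the storage shape rounds depth up to the
# next multiple of 16: e.g. a feature map of depth 20 occupies 32 depth elements of storage.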
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"

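# Rough sketch of how these classes are typically driven (the module-level entry point is further
# down the file, so the exact sequence used there may differ):
#
#   scheduler = Scheduler(nng, sg, arch, scheduler_options)
#   scheduler.create_scheduler_representation(arch)
#   max_schedule = scheduler.create_initial_schedule()
#   scheduler.update_op_memory_snapshot(max_schedule)
#   min_schedule = scheduler.propose_minimal_schedule()
#   scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule, sram_limit)
#   schedule = scheduler.optimize_schedule(min_schedule, max_schedule, max_schedule, scheduler_options)
#   scheduler.apply_schedule(schedule)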
class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

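    # For the Size strategy, an elementwise OFM whose depth is not a multiple of 16 is kept in
    # linear (NHWC) format whenever one of its NHWC inputs has a matching shape, format and dtype,
    # so that the output can potentially overwrite that input instead of wasting SRAM on padding.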
    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

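    # Buffering proposals walk the operators in execution order: the spare cycles and spare SRAM
    # (relative to the staging limit) recorded for one operator become the slack under which the
    # following operator's weights can be prefetched.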
    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are fms that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

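    # Weight prefetching in short: weights are first encoded for the full OFM depth, and if the DMA
    # for that encoding does not fit into the previous operator's slack, the depth is split into
    # slices that can be double buffered, shrinking the slice depth until the buffers fit the limit.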
    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

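    # Striping is propagated backwards through the operator list: the chosen OFM stripe of the last
    # operator determines how many IFM rows it needs, and that requirement becomes the OFM stripe
    # of the operator before it, and so on up the chain.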
    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

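    # "Non-local" memory usage is everything in the memory snapshot that is live at an operator's
    # time index but is not produced or consumed by the operator itself; the cascade builder treats
    # it as a fixed overhead that its cascades must fit alongside.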
    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFM's that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

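    # The search below proposes progressively taller stripes for the final operator of the cascade
    # and keeps the latest proposal that still fits the memory limit without splitting the cascade
    # into more pieces than the first proposal produced.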
    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)

            nbr_of_cascades = len(proposed_schedule.cascades)

            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/IFM dram bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

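    # Weight-buffer size optimization is speculative: performance is measured for the current
    # schedule, a new buffered schedule is proposed with the evicted-feature-map size limits
    # applied, and the default schedule is restored unless DRAM cycles improve without making the
    # total cycle count worse.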
    def optimize_weight_buffering_size(
        self,
        min_schedule: Schedule,
        options: SchedulerOptions,
    ):
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        # Restore mem/type for scratched_fms
        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.update_op_memory_snapshot(self.sg.schedule)

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Find the relation between the sched_op and the buffering tensor
        weight_ops = {}
        for sched_op in self.sched_ops:
            cost = self.sg.schedule.cost_map[sched_op]
            for tens in cost.buffered_weight_tensors:
                weight_ops[tens] = sched_op

        # Filter out weight buffer live ranges
        weight_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if weight_ops.get(tens):
                    weight_lrs.append(lr)
                    break

        # See if any evicted fm overlaps with a weight buffering op.
        # If this is the case add a size limitation to the buffering op
        for lr in self.evicted_fms:
            for weight_lr in weight_lrs:
                if lr.overlaps_ranges(weight_lr):
                    for tens in weight_lr.tensors:
                        sched_op = weight_ops.get(tens)
                        if sched_op:
                            # Add size reduction to the op
                            sched_op.evicted_fms_size += lr.size
                            break

        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(self.sg.schedule)

        # Run schedule buffering - with weight buffer size reduction
        schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
        schedule.cascades = self.sg.schedule.cascades
        self.sg.schedule = schedule

        # Apply new buffer schedule and calc new performance
        self.update_op_memory_snapshot(self.sg.schedule)
        self.apply_schedule(self.sg.schedule)
        self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        improvement_tot = (
            round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
        )
        improvement_dram = (
            round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
        )

        # Compare both total and dram improvement
        if not (improvement_tot >= 0 and improvement_dram > 0):
            # No improvement, restore the default schedule
            for sched_op in self.sched_ops:
                sched_op.evicted_fms_size = 0

            for tens in self.scratched_fms:
                tens.mem_area = self.scratched_fms[tens][0]
                tens.mem_type = self.scratched_fms[tens][1]

            self.sg.schedule = default_schedule
            self.update_op_memory_snapshot(self.sg.schedule)
            self.apply_schedule(self.sg.schedule)
            self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

Tim Halld8339a72021-05-27 18:49:40 +01001206 def apply_schedule(self, sched: Schedule):
1207 """Applies the given schedule as a final solution"""
1208 for sched_op in self.sched_ops:
1209 op_info = sched.cost_map[sched_op]
1210 cascade_info = sched.cascades.get(op_info.cascade, None)
1211 if cascade_info and sched_op in cascade_info.buffers:
1212 buffer_tens = sched_op.ifm.connection.parent_tens
1213 # Apply memory area and type
1214 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1215 buffer_tens.mem_type = MemType.Scratch_fast
1216 # Apply Rolling buffer
1217 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1218 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1219
1220 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1221
1222 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001223 for tens in op_info.buffered_weight_tensors:
1224 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001225
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001226 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001227 max_mem_usage = []
1228 base_mem_usage = []
1229 fast_storage_type = MemType.Scratch_fast
1230 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001231 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001232
1233 # Force all OFMs to fast-storage
1234 for sched_op in self.sched_ops:
1235 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001236 if cost.cascade == 0 and sched_op.get_dependants():
1237 ofm_tens = sched_op.ofm.connection.parent_tens
1238 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001239 if ofm_tens not in self.scratched_fms:
1240 # Remember default mem area and mem type, only done once
1241 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1242
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001243 ofm_tens.mem_area = fast_storage_mem_area
1244 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001245
1246 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001247 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001248 lr_graph = live_range.LiveRangeGraph()
1249 for mem_area, mem_type_set in memories_list:
1250 live_range.extract_live_ranges_from_cascaded_passes(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001251 self.nng.get_root_subgraph(),
1252 mem_area,
1253 mem_type_set,
1254 lr_graph,
1255 Tensor.AllocationQuantum,
Tim Halld8339a72021-05-27 18:49:40 +01001256 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001257 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001258
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001259 # If everything fits in fast storage there is nothing more to do
1260 if max(max_mem_usage) <= staging_limit:
1261 return
1262
1263 # Build up the base memory usage by removing the
1264 # mem_usage of the lrs we previously moved to fast-storage
1265 base_mem_usage = np.array(max_mem_usage)
1266 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001267 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001268 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001269 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 curr_lrs.append(lr)
1271 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1272 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001273 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001274 competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001275 for lr in curr_lrs:
1276 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
1277 # If true, the lr will never fit and may thus be evicted
1278 if base_usage + lr.size > staging_limit:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001279 self.evicted_fms.append(lr)
1280 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001281 continue
1282 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1283 # the staging limit cannot be exceeded as long as max_mem_usage stays within it.
1284 # Thus, the affected lrs can remain in fast-storage if the following is true
1285 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
1286 FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
1287 else:
1288 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001289 for tens in lr.tensors:
1290 competing_tens_access[tens] = 0
1291
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001292 competing_lrs_sz = len(competing_lrs)
1293 # If competing_lrs_sz is zero, all lrs and their tensors have been handled and we can return
1294 if competing_lrs_sz == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001295 return
1296
Johan Alfvén5c309712022-06-10 15:40:58 +02001297 # Estimate element access for all tensors that are competing for a place in fast-storage.
1298 # This number is used when deciding which tensors stay in fast-storage
1299 for sched_op in self.sched_ops:
1300 cost = schedule.cost_map[sched_op]
1301
1302 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1303 tens = sched_op.ifm.connection.parent_tens
1304 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1305 competing_tens_access[tens] += access.ifm_read[0]
1306
1307 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1308 tens = sched_op.ifm2.connection.parent_tens
1309 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1310 competing_tens_access[tens] += access.ifm_read[1]
1311
1312 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1313 tens = sched_op.ofm.connection.parent_tens
1314 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1315 competing_tens_access[tens] += access.ofm_write
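# At this point competing_tens_access maps each competing tensor to its estimated number of
# element accesses, e.g. (hypothetical values) {ifm_tens: 120_000, ofm_tens: 48_000}; a higher
# value makes it more attractive to keep that tensor in fast storage during allocation below.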
1316
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001317 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001318
1319 # Remove lrs whose live range is too long compared to the others.
1320 # They cause problems for the HillClimb Allocator when it has to
1321 # change the allocation indices in order to fit all the allocations into SRAM.
1322 # This problem only occurs in larger networks with complex graphs.
1323 #
1324 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1325 # at a time. Too many items would make the compilation time too long.
1326 #
1327 # "Too long" is currently decided as follows (based on experience from analyzing many networks):
1328 # compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1329 # If the end times differ by at least MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
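# Example using the constants defined on FastStorageComponentAllocator (both currently 20) and
# hypothetical end times: an lr at position i whose live range spans at least 20 time steps and
# ends at time 150 is removed if the lr at position i + 20 ends at time 120, since 150 > 120 + 20.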
1330 if competing_lrs_sz > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1331 # create a copy of the original list to iterate over because the original version is modified in-loop
1332 competing_lrs_copy = competing_lrs.copy()
1333 for i, lr in enumerate(competing_lrs_copy):
1334 lr_time = lr.end_time - lr.start_time
1335 if lr_time < FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1336 # Skip small ranges
1337 continue
1338
1339 # Compare the current lr with the lr at position i + MAX_EXHAUSTIVE_ITEMS
1340 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, competing_lrs_sz - 1)
1341
1342 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1343 if (
1344 lr.end_time
1345 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1346 ):
1347 # The current lr's live range stands out, so remove it. There is no point adding it to the
1348 # evicted_fms list since the lr should not be included in the fast storage allocation
1349 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1350 competing_lrs.remove(lr)
1351
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001352 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001353 end_time = competing_lrs[0].end_time
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001354 competing_lrs_sz = len(competing_lrs)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001355 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
1356 # Build up components and then allocate each separately
1357 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001358 nbr_items = i - start
1359 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001360 end_time = max(end_time, lr.end_time)
1361 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001362 # The number of items has reached the maximum, or the current lr's start time
1363 # does not overlap with the previous lr's end time
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001364 component_allocator.allocate_component(
1365 component_allocator,
1366 competing_lrs[start:i],
1367 max_mem_usage,
1368 base_mem_usage,
1369 staging_limit,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001370 self.scratched_fms,
Johan Alfvén5c309712022-06-10 15:40:58 +02001371 competing_tens_access,
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001372 self.evicted_fms,
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001373 )
1374 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001375 end_time = lr.end_time
1376 component_allocator.allocate_component(
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001377 component_allocator,
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001378 competing_lrs[start:competing_lrs_sz],
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001379 max_mem_usage,
1380 base_mem_usage,
1381 staging_limit,
1382 self.scratched_fms,
Johan Alfvén5c309712022-06-10 15:40:58 +02001383 competing_tens_access,
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001384 self.evicted_fms,
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001385 )
Tim Halld8339a72021-05-27 18:49:40 +01001386
1387 def move_constant_data(self):
1388 """Determine if data, can be moved from permanent storage to another memory area. A move
1389 will generate a DMA command in the high-level command stream"""
1390 for sched_op in self.sched_ops:
1391 parent_op = sched_op.parent_op
1392 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1393 max_ifm_shram_avail = (
1394 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1395 * self.arch.shram_bank_size
1396 // 2
1397 )
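# Worked example with hypothetical architecture parameters: 48 available SHRAM banks, 2 reserved
# output banks and a bank size of 1024 bytes give
# max_ifm_shram_avail = (48 - 2) * 1024 // 2 = 23552 bytes.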
1398
1399 for idx, tens in enumerate(parent_op.inputs):
1400 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1401 # Tensor is in permanent storage
1402 # Only when permanent storage differs from feature map storage is there a point in moving the data
1403 if (
1404 tens.mem_area in self.arch.permanent_storage_mem_area
1405 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1406 ) or tens.purpose == TensorPurpose.LUT:
1407 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001408 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001409 tens.purpose == TensorPurpose.FeatureMap
1410 and sched_op.op_type.is_binary_elementwise_op()
1411 and tens.shape != []
1412 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001413 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001414 and tens.storage_size() > max_ifm_shram_avail
1415 ):
1416 only_vector_product_consumers = all(
1417 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1418 for oper in tens.consumers()
1419 )
1420
1421 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1422 new_tens = tens.clone_into_fast_storage(self.arch)
1423 if tens.purpose == TensorPurpose.LUT:
1424 new_tens.mem_area = MemArea.Shram
1425
1426 new_tens.consumer_list.append(parent_op)
1427 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001428 # If the index is out of range, IFM and IFM2 are the same tensor
1429 # and pass inputs don't have duplicates
1430 if idx < len(sched_op.parent_ps.inputs):
1431 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001432
1433 def print_schedule(self, schedule: Schedule):
1434 print(f"Schedule: '{schedule.name}'")
1435 for sched_op in self.sched_ops:
1436 if sched_op not in schedule.cost_map:
1437 # Sub-schedule printing
1438 continue
1439
1440 op_info = schedule.cost_map[sched_op]
1441 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1442 print(f"\t\tType: {sched_op.op_type}")
1443 print(f"\t\tKernel: {sched_op.kernel}")
1444 print(f"{op_info}")
1445 mem_usage = (
1446 schedule.memory_snapshot[op_info.time_index]
1447 if op_info.time_index < len(schedule.memory_snapshot)
1448 else 0
1449 )
1450 print(f"\t\tSRAM Used: {mem_usage} bytes")
1451
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001452 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001453 for i, cascade in enumerate(schedule.cascades.values()):
1454 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001455
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001456
Tim Halld8339a72021-05-27 18:49:40 +01001457def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1458 """
1459 Creates live ranges and runs tensor allocator for the current schedule
1460 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1461 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1462 """
1463 root_sg = nng.get_root_subgraph()
1464
1465 alloc_list = []
1466 if arch.is_spilling_enabled():
1467 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1468 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1469 # Order is important
1470 alloc_list.append(mem_alloc_scratch_fast)
1471 alloc_list.append(mem_alloc_scratch)
1472 else:
1473 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1474 alloc_list.append(mem_alloc_scratch)
1475
1476 for mem_area, mem_type_set in alloc_list:
1477 tensor_allocation.allocate_tensors(
1478 nng,
1479 root_sg,
1480 arch,
1481 mem_area,
1482 mem_type_set,
1483 tensor_allocator=options.tensor_allocator,
1484 verbose_allocation=options.verbose_allocation,
1485 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001486 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001487 )
1488
1489
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001490class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001491 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001492 MAX_EXHAUSTIVE_ITEMS = 20
Johan Alfvén5c309712022-06-10 15:40:58 +02001493
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001494 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1495 self.base_mem_usage = base_mem_usage
1496 self.max_mem_usage = list(max_mem_usage)
1497 self.staging_limit = staging_limit
1498 self.lrs = []
1499 self.evicted = []
1500 self.curr_evicted = []
1501 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001502 self.best_score = 0
1503 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001504
Johan Alfvén5c309712022-06-10 15:40:58 +02001505 def allocate_exhaustive(self, ix, score):
1506 # Favour tensors with highest element access (score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001507 if ix >= len(self.lrs):
Johan Alfvén5c309712022-06-10 15:40:58 +02001508 if score > self.best_score:
1509 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001510 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001511 return
1512
1513 lr = self.lrs[ix]
1514 for t in range(lr.start_time, lr.end_time):
1515 assert self.base_mem_usage[t] <= self.max_mem_usage[t]
1516 base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
1517 can_fit = base_usage + lr.size <= self.staging_limit
1518 always_fits = can_fit
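# can_fit: the lr fits on top of the memory already committed to fast storage (base_mem_usage).
# always_fits: it also fits in the worst case where every still-undecided lr stays in fast
# storage (max_mem_usage); in that case evicting it never has to be considered.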
1519
1520 if can_fit:
1521 max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
1522 always_fits = max_usage <= self.staging_limit
1523
1524 if can_fit or always_fits:
1525 self.curr_evicted[ix] = False
1526 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Johan Alfvén5c309712022-06-10 15:40:58 +02001527 tens = lr.tensors[0]
1528 # Tensor is being included - add tensor element access to the score
1529 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[tens])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001530 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1531
1532 if not always_fits:
1533 self.curr_evicted[ix] = True
1534 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001535 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1537
1538 @staticmethod
1539 def update_mem_usage(mem_usage, lr, increase):
1540 for t in range(lr.start_time, lr.end_time + 1):
1541 mem_usage[t] += lr.size if increase else -lr.size
1542 assert mem_usage[t] >= 0
1543 return mem_usage
1544
1545 @staticmethod
1546 def evict(lr, max_mem_usage, scratched_fms):
1547 for t in range(lr.start_time, lr.end_time + 1):
1548 max_mem_usage[t] -= lr.size
1549 for tens in lr.tensors:
1550 if tens in scratched_fms:
1551 tens.mem_area = scratched_fms[tens][0]
1552 tens.mem_type = scratched_fms[tens][1]
1553
1554 @staticmethod
1555 def keep(lr, base_mem_usage, staging_limit):
1556 for t in range(lr.start_time, lr.end_time + 1):
1557 base_mem_usage[t] += lr.size
1558 assert base_mem_usage[t] <= staging_limit
1559
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001560 def allocate_component(
1561 self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms, competing_tens_access, evicted_fms
1562 ):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001563 sz = len(lrs)
1564 allocator.lrs = lrs
1565 allocator.evicted = [0] * len(lrs)
1566 allocator.curr_evicted = [0] * sz
Johan Alfvén5c309712022-06-10 15:40:58 +02001567 allocator.best_score = -1
1568 allocator.competing_tens_access = competing_tens_access
1569 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1570 # For every permutation that fits within the staging_limit there is a score calculated.
1571 # The permutation with the highest score will then be chosen. The score is calculated
1572 # as the sum of the actual element access (ifm read and ofm write) for all the
1573 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1574 # being included in the result.
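# Illustrative example (hypothetical sizes and accesses): with a staging_limit of 100 and two
# overlapping lrs A (size 60, element access 1000) and B (size 50, element access 400), only one
# of them fits at a time; the exhaustive search keeps A and evicts B because keeping A gives the
# higher score, even though evicting A would free more memory.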
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001575 allocator.allocate_exhaustive(0, 0)
1576
1577 # Optimal allocation has been found, move lrs accordingly
1578 for i, e in enumerate(allocator.evicted):
1579 if e:
1580 self.evict(lrs[i], max_mem, scratched_fms)
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001581 if lrs[i] not in evicted_fms:
1582 evicted_fms.append(lrs[i])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001583 else:
1584 self.keep(lrs[i], min_mem, staging_limit)
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001585 if lrs[i] in evicted_fms:
1586 evicted_fms.remove(lrs[i])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001587
1588
Tim Halld8339a72021-05-27 18:49:40 +01001589def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1590 """Entry point for the Scheduler"""
1591 # Initialize CPU subgraphs
1592 schedulers = dict()
1593 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001594 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001595 if sg.placement != PassPlacement.Npu:
1596 # Create cascaded passes for CPU Ops
1597 cascaded_passes = []
1598 for idx, ps in enumerate(sg.passes):
1599 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001600 ps.name,
1601 SchedulingStrategy.WeightStream,
1602 ps.inputs,
1603 [],
1604 ps.outputs,
1605 [ps],
1606 ps.placement,
1607 False,
Tim Halld8339a72021-05-27 18:49:40 +01001608 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001609
Tim Halld8339a72021-05-27 18:49:40 +01001610 cps.time = idx
1611 ps.cascade = cps
1612 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001613
Tim Halld8339a72021-05-27 18:49:40 +01001614 sg.cascaded_passes = cascaded_passes
1615 else:
1616 # Npu subgraph - create schedule
1617 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1618 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001619
Tim Halld8339a72021-05-27 18:49:40 +01001620 scheduler.create_scheduler_representation(arch)
1621 sg.sched_ops = scheduler.sched_ops
1622 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001623
Tim Halld8339a72021-05-27 18:49:40 +01001624 # Create the Max schedule template
1625 max_schedule_template = scheduler.create_initial_schedule()
1626 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001627
Tim Halld8339a72021-05-27 18:49:40 +01001628 # Create the optimised Max schedule
1629 sg.schedule = max_schedule_template
1630 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001631 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001632 sg.schedule = opt_max_schedule
1633 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001634
Tim Halld8339a72021-05-27 18:49:40 +01001635 # Create Min schedule
1636 min_schedule = scheduler.propose_minimal_schedule()
1637 initial_sram_limit = scheduler_options.optimization_sram_limit
1638 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1639 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001640
Johan Alfvén255dad72022-07-16 18:27:05 +02001641 # Build cascades for Min schedule
1642 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001643 sg.schedule = min_schedule
1644 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001645
Tim Halld8339a72021-05-27 18:49:40 +01001646 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1647 # Create an optimized schedule
1648 sg.schedule = scheduler.optimize_schedule(
1649 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1650 )
1651 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001652
Tim Halld8339a72021-05-27 18:49:40 +01001653 scheduler.apply_schedule(sg.schedule)
1654 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001655
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001656 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1657 # It might be possible to gain performance by reducing
1658 # weight buffer size and instead fit fms in fast storage
1659 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1660
Tim Halld8339a72021-05-27 18:49:40 +01001661 if scheduler_options.verbose_schedule:
1662 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001663
Tim Halld8339a72021-05-27 18:49:40 +01001664 # Evaluate schedule
1665 _update_tensor_allocation(nng, arch, options)