Blame - ethosu/vela/scheduler.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

18

# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and

19

# subdivisions for the Operators

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

20

# For Class name forward references for the type annotations. (see PEP 563).

21

from __future__ import annotations

22

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

23

import copy

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

24

from collections import namedtuple

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

25

from enum import auto

26

from enum import IntEnum

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame]

27

from typing import Any

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

28

from typing import Dict

29

from typing import List

30

from typing import Optional

31

from typing import Tuple

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

32

from typing import TYPE_CHECKING

33

34

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.

35

if TYPE_CHECKING:

36

from .npu_performance import CycleCost

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

37

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

38

import numpy as np

39

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

40

from . import live_range

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

41

from . import npu_performance

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

42

from . import tensor_allocation

43

from . import weight_compressor

44

from .architecture_allocator import ArchitectureBlockConfig

45

from .architecture_allocator import find_block_config

46

from .architecture_allocator import get_ifm_area_required

Fredrik Svedberg

d03dc50

2022-06-30 10:44:12 +0200

[diff] [blame]

47

from .architecture_allocator import to_upscale

erik.andersson@arm.com

8912f3a

2022-08-16 11:08:57 +0200

[diff] [blame]

48

from .architecture_allocator import is_nearest

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

49

from .architecture_features import ArchitectureFeatures

50

from .architecture_features import Block

51

from .cascade_builder import CascadeBuilder

52

from .cascade_builder import CascadeInfo

Fredrik Svedberg

880e735

2020-08-25 11:31:47 +0200

[diff] [blame]

53

from .data_type import DataType

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

54

from .nn_graph import CascadedPass

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

55

from .nn_graph import Graph

56

from .nn_graph import Pass

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

57

from .nn_graph import PassPlacement

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

58

from .nn_graph import SchedulingStrategy

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

59

from .nn_graph import Subgraph

60

from .numeric_util import round_down

61

from .numeric_util import round_up

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

62

from .operation import NpuBlockType

Louis Verhaard

aee5d75

2020-09-30 09:01:52 +0200

[diff] [blame]

63

from .operation import Op

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

64

from .shape4d import Shape4D

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

65

from .tensor import MemArea

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

66

from .tensor import MemType

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

67

from .tensor import Tensor

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

68

from .tensor import TensorFormat

69

from .tensor import TensorPurpose

70

from .tensor import TensorSubPurpose

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

71

from .weight_compressor import NpuWeightTensor

Jacob Bohlin

1a66697

2020-09-11 10:04:15 +0200

[diff] [blame]

72

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

73

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

74

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Returns the shape used to store a tensor of `shape` in `tensor_format`

    For TensorFormat.NHCWB16 the depth is replaced by round_up(shape.depth, 16).
    For every other format `shape` is returned as it was passed in.
    """
    if tensor_format != TensorFormat.NHCWB16:
        return shape

    padded_depth = round_up(shape.depth, 16)
    return shape.with_depth(padded_depth)

class OptimizationStrategy(IntEnum):
    """Optimization goals that the Scheduler can be configured with

    The member values are generated by auto().
    """

    Size = auto()
    Performance = auto()

    def __str__(self) -> str:
        # Render as the bare member name, e.g. "Size"
        label = self.name
        return label

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

91

class SchedulerOpInfo:
    """Per-Schedule metadata belonging to a single SchedulerOperation"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        """Stores the given configuration and resets every derived field to its default"""
        # Values supplied by the caller
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        # Bookkeeping that is filled in later
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        # Starts out as a single slice covering the full depth of the OFM stripe
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        """Returns a new SchedulerOpInfo built from this one's constructor arguments

        Only the cascade assignment is carried over in addition; all other
        fields of the result keep the defaults set by __init__.
        """
        duplicate = SchedulerOpInfo(
            block_config=self.block_config,
            weights_size=self.weights_size,
            stripe_input=self.stripe_input,
            stripe_input2=self.stripe_input2,
            stripe=self.stripe,
        )
        duplicate.cascade = self.cascade
        return duplicate

    def __str__(self):
        # Evaluates to the falsy tensor value itself (e.g. None) when there are no encoded weights
        encoded_weights = self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)
        lines = [
            f"\t\tBlock Config = {self.block_config}",
            f"\t\tOFM Block = {self.block_config.ofm_block}",
            f"\t\tIFM Stripe = {self.stripe_input}",
            f"\t\tIFM2 Stripe = {self.stripe_input2}",
            f"\t\tOFM Stripe = {self.stripe}",
            f"\t\tEncoded Weights = {encoded_weights} bytes",
        ]
        # One line per buffered weight tensor, numbered from 1
        lines.extend(
            f"\t\tWeight buffer{number} = {tensor.storage_size()} bytes"
            for number, tensor in enumerate(self.buffered_weight_tensors, start=1)
        )
        lines.append(f"\t\tDepth slices = {self.ofm_depth_slices}")
        lines.append(f"\t\tAssigned Cascade = {self.cascade}")
        return "\n".join(lines)

class SchedulerOptions:
    """Bundle of settings that is handed to the Scheduler"""

    def __init__(self, optimization_strategy, sram_target, verbose_schedule):
        self.optimization_strategy = optimization_strategy
        # The SRAM target is stored under the attribute name optimization_sram_limit
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        # Class name followed by the dict of all instance attributes
        attributes = self.__dict__
        return f"{type(self).__name__}: {attributes}"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

162

class SchedulerTensor:
    """Shape, data type, memory area and format of one feature map as seen by the Scheduler"""

    def __init__(self, shape, dt, mem_area, _format):
        self.dtype, self.mem_area, self.shape, self.format = dt, mem_area, shape, _format
        # No Connection is attached at construction time
        self.connection = None

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

169

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

170

171

class SchedulerOperation:
    """Scheduler internal representation of 'Operation'

    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        """Builds the scheduler node for the primary op of the Pass `ps`

        For a binary elementwise op the two inputs may be swapped (see the end of this method). In that
        case ifm_shapes, inputs, ifm_tensor and ifm2_tensor of `ps` are rewritten as well.
        `nng` is accepted but not used by this constructor.
        """
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        # True when the op has a second input and either input has an empty shape list
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that marks whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                # Keep the parent Pass consistent with the swapped operands
                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input

        Raises AssertionError if this operation has no IFM2.
        """
        if not self.ifm2:
            # Raised explicitly rather than via `assert False`, which `python -O` would strip
            raise AssertionError(f"Trying to set an IFM2 Connection to {self} which has no IFM2")
        conn.consumers.append(self)
        self.ifm2.connection = conn

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    @staticmethod
    def _storage_size_in_bytes(tens: SchedulerTensor) -> int:
        """Returns the byte size of `tens` in its storage format, passed through round_up with
        Tensor.AllocationQuantum"""
        storage_shape = shape_for_format(tens.shape, tens.format)
        return round_up(storage_shape.elements() * tens.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        return self._storage_size_in_bytes(self.ifm)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes, or 0 when there is no IFM2"""
        if self.ifm2:
            return self._storage_size_in_bytes(self.ifm2)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        return self._storage_size_in_bytes(self.ofm)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce

        `stripe` is the OFM shape to produce. As a side effect the block config that was found is also
        written to parent_ps.block_config in its old style representation. `nng` is not used.
        """
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                # The second input gets the same stripe, limited to its own height and width
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        """Returns the IFM requirement of the smallest possible stripe"""
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        """Returns the result of find_block_config for the given input and output shapes"""
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )

class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations

    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        # The Tensor that this edge stands for
        self.parent_tens = tensor

        # SchedulerOperations on either side of the edge, both empty at first
        self.producers: List[SchedulerOperation] = list()
        self.consumers: List[SchedulerOperation] = list()

    def __str__(self):
        tensor_name = self.parent_tens.name
        return f"<Connection {tensor_name}>"

    __repr__ = __str__

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

380

class Schedule:
    """Holds one solution for scheduling an NPU subgraph together with its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg, self.label = sg, label
        # Per-operation cost information of this solution
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = dict()
        self.cascades: Dict[int, CascadeInfo] = dict()
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        """The subgraph name and the label of this schedule joined by an underscore"""
        sg_name = self.sg.name
        return f"{sg_name}_{self.label}"

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

394

395

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

396

class Scheduler:

397

"""Main class of the Vela Scheduling"""

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

398

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

399

def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

400

self.nng = nng

401

self.sg = sg

402

self.arch = arch

Ayaan Masood

b801dda

2022-02-22 11:28:55 +0000

[diff] [blame]

403

self.sched_ops: List[SchedulerOperation] = []

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

404

self.max_schedule: Optional[Schedule] = None

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

405

self.scheduler_options = options

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

406

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame]

407

self.scratched_fms: Dict[Tensor, Any] = {}

408

self.evicted_fms: List[live_range.LiveRange] = []

409

Johan Alfvén

5e0ae55

2022-02-09 21:20:10 +0100

[diff] [blame]

410

def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
    """Returns True when NHCWB16 should be avoided for the OFM of the primary op of `ps`

    This is only ever True when the optimization strategy is not Performance, the primary op is
    elementwise, the last entry of its first OFM shape is not a multiple of 16 and one of its inputs
    matches the output in op shape, format and data type while being in NHWC format.
    `tens` and `arch` are not used.
    """
    # Only run this check for opt strategy Size
    if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
        return False

    primary_op = ps.primary_op
    if not primary_op.type.is_elementwise_op():
        return False

    ofm_depth = primary_op.ofm_shapes[0][-1]
    if ofm_depth % 16 == 0:
        return False

    # Check if overwriting the inputs can be allowed
    out_shape, out_tens = primary_op.ofm_shapes[0], primary_op.ofm
    candidates = []
    if primary_op.ifm is not None:
        candidates.append((primary_op.ifm_shapes[0], primary_op.ifm))
    if primary_op.ifm2 is not None:
        candidates.append((primary_op.ifm_shapes[1], primary_op.ifm2))

    # Find an input tensor that can be overwritten by the output
    for in_shape, in_tens in candidates:
        # check op input and output shapes allow overlapping
        if not (in_shape == out_shape):
            continue
        # check input tensor is valid
        if in_tens is None or not (in_tens.shape != []):
            continue
        # check input and output tensors are compatible
        if not (in_tens.format == out_tens.format):
            continue
        if not (in_tens.dtype == out_tens.dtype):
            continue
        if in_tens.format == TensorFormat.NHWC:
            return True

    return False

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

449

def create_scheduler_representation(self, arch: ArchitectureFeatures):
    """Creates a Scheduler Graph representation

    Every Pass of the subgraph that has a primary op gets a SchedulerOperation appended to
    self.sched_ops, connected through one Connection per tensor. The largest amount of memory that
    any single op needs for the FeatureMaps it requires in full is stored in self.min_memory_req.
    """
    sg = self.sg
    # Temporary dict for creating connections between the Operations
    connections: Dict[Tensor, Connection] = {}
    # Memory required for the largest FeatureMap that has to be full
    largest_full_fm_req = 0

    for ps in sg.passes:
        if not ps.primary_op:
            continue

        # Set tensor format to NHCWB16 for output FeatureMaps, if possible
        for output in ps.outputs:
            if output in sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                continue
            if output.needs_linear_format:
                continue
            if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                output.needs_linear_format = True
            else:
                output.set_format(TensorFormat.NHCWB16, arch)

        # Create SchedulerOperations
        op = SchedulerOperation(ps, arch, self.nng)
        op.index = len(self.sched_ops)

        # The constructor above may have swapped ps.ifm_tensor and ps.ifm2_tensor,
        # so the tensors must only be read from the Pass after it has run
        ifm_tens, ifm2_tens, ofm_tens = ps.ifm_tensor, ps.ifm2_tensor, ps.ofm_tensor

        # Make connections
        if ifm_tens not in connections:
            connections[ifm_tens] = Connection(ifm_tens)
        if ifm2_tens and ifm2_tens not in connections:
            connections[ifm2_tens] = Connection(ifm2_tens)
        if ofm_tens not in connections:
            connections[ofm_tens] = Connection(ofm_tens)

        op.add_ifm_connection(connections[ifm_tens])
        if ifm2_tens:
            op.add_ifm2_connection(connections[ifm2_tens])
        op.add_ofm_connection(connections[ofm_tens])

        self.sched_ops.append(op)

        # Set requirements on the ifm/ofm buffers. All conditions are evaluated first, then applied.
        # The Op consumes a subgraph input / produces a subgraph output
        ifm_is_sg_input = ifm_tens in sg.input_tensors
        ifm2_is_sg_input = ifm2_tens and ifm2_tens in sg.input_tensors
        ofm_is_sg_output = ofm_tens in sg.output_tensors
        ifm_is_linear = ifm_tens.needs_linear_format
        ifm2_is_linear = ifm2_tens and ifm2_tens.needs_linear_format
        ofm_is_linear_or_concat = (
            ofm_tens.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite
        )
        # Op has multiple outputs or consumers - requires full OFM
        ofm_is_shared = len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1

        if ifm_is_sg_input or ifm_is_linear:
            op.requires_full_ifm = True
        if ifm2_is_sg_input or ifm2_is_linear:
            op.requires_full_ifm2 = True
        if ofm_is_sg_output or ofm_is_linear_or_concat or ofm_is_shared:
            op.requires_full_ofm = True

        # Check memory requirements if this Op requires any full FeatureMaps
        full_fm_sizes = []
        if op.requires_full_ifm:
            full_fm_sizes.append(op.ifm_size_in_bytes())
        if op.requires_full_ifm2:
            full_fm_sizes.append(op.ifm2_size_in_bytes())
        if op.requires_full_ofm:
            full_fm_sizes.append(op.ofm_size_in_bytes())

        largest_full_fm_req = max(sum(full_fm_sizes), largest_full_fm_req)

    # Theoretical minimum required memory - used to guide the cascade building
    self.min_memory_req = largest_full_fm_req

522

523

def create_initial_schedule(self) -> Schedule:
    """Creates an initial schedule with no cascading or buffering of any kind

    Each op in self.sched_ops is costed for its complete OFM shape and the result is stored in the
    cost map of a new Schedule labelled "MAX".
    """
    max_schedule = Schedule(self.sg, "MAX")

    for sched_op in self.sched_ops:
        # The stripe is the complete OFM, i.e. the op is not striped
        op_info = sched_op.create_scheduler_info(self.nng, sched_op.ofm.shape)
        op_info.cycles = self.estimate_op_performance(sched_op, op_info.block_config, sched_op.ofm.shape.depth)
        max_schedule.cost_map[sched_op] = op_info

    return max_schedule

def update_op_memory_snapshot(self, schedule: Schedule):
    """Stores the temporal fast storage usage and its peak in `schedule`

    Writes schedule.memory_snapshot and schedule.fast_storage_peak_usage.
    """
    fast_storage_area = self.arch.fast_storage_mem_area
    scratch_mem_types = {MemType.Scratch, MemType.Scratch_fast}

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        fast_storage_area,
        scratch_mem_types,
        lr_graph,
        Tensor.AllocationQuantum,
    )

    # Populate time-array with memory used by live ranges
    usage_over_time = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
    schedule.memory_snapshot = usage_over_time

    # Set the peak memory usage, which is 0 for an empty time-array
    schedule.fast_storage_peak_usage = max(usage_over_time, default=0)

553

554

def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
    """Returns the result of npu_performance.measure_cycle_cost for `op`

    The query describes `op` with its OFM depth replaced by `ofm_depth` and uses `block_config`.
    """
    query = self._build_performance_query(op, block_config, ofm_depth)
    return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
    """Returns the result of npu_performance.measure_element_access for `op`

    The query describes `op` with its OFM depth replaced by `ofm_depth` and uses `block_config`.
    """
    query = self._build_performance_query(op, block_config, ofm_depth)
    return npu_performance.measure_element_access(self.arch, query)

def _build_performance_query(self, op: SchedulerOperation, block_config, ofm_depth):
    """Returns the PerformanceQuery shared by estimate_op_performance and estimate_element_access"""
    query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
    query.ifm_shape = op.ifm.shape
    query.ifm_memory_area = op.ifm.mem_area
    query.ifm_bits = op.ifm.dtype.size_in_bits()
    query.ifm_format = op.ifm.format
    # The IFM2 fields take the falsy value of op.ifm2 itself when there is no second input
    query.ifm2_shape = op.ifm2 and op.ifm2.shape
    query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
    query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
    query.ifm2_format = op.ifm2 and op.ifm2.format
    query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
    query.ofm_memory_area = op.ofm.mem_area
    query.ofm_bits = op.ofm.dtype.size_in_bits()
    query.ofm_format = op.ofm.format
    if op.parent_op.bias:
        # Note that the const shape uses the full OFM depth of the op and not `ofm_depth`
        query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
        query.const_memory_area = self.arch.fast_storage_mem_area

    query.kernel = op.kernel
    query.config = block_config

    return query

599

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

600

def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
    """Create a buffered schedule

    Returns a new Schedule whose label is that of `ref_schedule` with "_BUFFERED" appended. It is
    filled by calling propose_operator_buffering for every op that is part of `ref_schedule`.
    """
    buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

    # sched_ops that are not part of this sub-schedule are skipped
    ops_in_schedule = (candidate for candidate in self.sched_ops if candidate in ref_schedule.cost_map)

    previous = None
    for current in ops_in_schedule:
        self.propose_operator_buffering(current, previous, buffered_schedule, ref_schedule, staging_limit_bytes)
        previous = current

    return buffered_schedule

614

615

def propose_operator_buffering(
    self,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    staging_limit_bytes,
):
    """Add a cost entry for `sched_op` to `buffered_schedule` and buffer its weights.

    The entry is a shallow copy of the reference cost with the buffering slack filled in:
    the slack cycles are the reference op cycles and the slack memory is
    `staging_limit_bytes` minus the reference memory usage at the op's time index (0 when
    the index is outside the memory snapshot). Ops whose parent op has weights are then
    passed on to `propose_weight_buffering`.

    Returns the new cost entry, or None when `sched_op` already has one.
    """
    buffered_costs = buffered_schedule.cost_map
    if sched_op in buffered_costs:
        # Mild recursion might mean this Op has already been seen
        return

    # The reference schedule provides the default costings for this schedule
    ref_cost = ref_schedule.cost_map[sched_op]
    cost = copy.copy(ref_cost)
    cost.slack_buffering_cycles = ref_cost.cycles.op_cycles

    snapshot = ref_schedule.memory_snapshot
    if ref_cost.time_index < len(snapshot):
        used_bytes = snapshot[ref_cost.time_index]
    else:
        used_bytes = 0
    cost.slack_buffering_memory = staging_limit_bytes - used_bytes
    buffered_costs[sched_op] = cost

    # Weight buffering is attempted on anything with a weights tensor
    if sched_op.parent_op.weights:
        limit_bytes = cost.slack_buffering_memory

        # Evicted feature maps (recorded on the op when some fms did not fit in fast
        # storage) shrink the limit, but only while the limit is at least 1.5x their size
        if sched_op.evicted_fms_size and ((limit_bytes / sched_op.evicted_fms_size) >= 1.5):
            limit_bytes -= sched_op.evicted_fms_size

        self.propose_weight_buffering(
            sched_op.parent_op.weights,
            sched_op.parent_op.bias,
            sched_op,
            prev_op,
            buffered_schedule,
            ref_schedule,
            limit_bytes,
        )

    return cost

def weights_needs_dma(self, weight_tensor):
    """Return True when `weight_tensor` should be moved by DMA, False otherwise.

    DMA is only needed for a weight tensor that is not already of a scratch memory type,
    that lives in Dram or OffChipFlash, and only when the permanent storage memory area
    differs from the fast storage memory area.
    """
    if not weight_tensor:
        return False

    if weight_tensor.mem_type in (MemType.Scratch, MemType.Scratch_fast):
        return False

    # Weights are in permanent storage - there is only a point in moving the data when
    # permanent storage differs from feature map storage
    in_slow_memory = weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
    if in_slow_memory and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area:
        return True

    return False

def propose_weight_buffering(
    self,
    weight_tensor,
    scale_tensor,
    sched_op: SchedulerOperation,
    prev_op: Optional[SchedulerOperation],
    buffered_schedule: Schedule,
    ref_schedule: Schedule,
    buffer_limit_bytes,
):
    """Encode the weights of `sched_op` and decide how they are buffered.

    The weights and scales are first encoded for the full OFM depth. If the op is a
    FullyConnected or the weights need no DMA (see `weights_needs_dma`), that encoding is
    stored on the op's cost entry and nothing more is done. Otherwise, for non-cascaded
    ops, the depth may be split into slices derived from the slack cycles of the previous
    op and from `buffer_limit_bytes`. One or two buffer tensors are then created if the
    weight buffer fits within `buffer_limit_bytes`; if it does not, the full-depth
    encoding is used unbuffered.

    All results are written to `buffered_schedule.cost_map[sched_op]`; nothing is
    returned. For cascaded ops the memory snapshot of `ref_schedule` is also updated.

    `prev_op` may be None, in which case no slack is assumed to be available.
    """
    cost = buffered_schedule.cost_map[sched_op]
    # None when prev_op is None or has no entry in the buffered schedule
    prev_cost = buffered_schedule.cost_map.get(prev_op)
    ref_cost = ref_schedule.cost_map[sched_op]
    assert cost and ref_cost

    needs_dma = self.weights_needs_dma(weight_tensor)

    # A single slice covering the whole stripe depth
    ofm_full_depth_slices = [0, ref_cost.stripe.depth]

    # Encode weights for the full depth
    full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
        self.arch,
        sched_op.parent_op,
        weight_tensor,
        scale_tensor,
        sched_op.kernel,
        cost.block_config,
        ofm_full_depth_slices,
    )
    full_weights_bytes = len(full_weights.buffer)
    cost.ofm_depth_slices = ofm_full_depth_slices

    # No buffering required - take all the weights from permanent storage
    if sched_op.op_type == Op.FullyConnected or not needs_dma:
        cost.npu_weights_tensor = full_weights
        cost.npu_scales_tensor = full_scales
        return

    # Start from the full-depth encoding; replaced below if the weights get sliced
    encoded_weights: Optional[NpuWeightTensor] = full_weights
    encoded_scales = full_scales

    # How many NPU cycles are available under the previously executing
    # operator and SRAM unused for performing buffered DMA transfers
    slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
    slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

    # Force full depth for cascaded Ops
    if ref_cost.cascade != 0:
        weight_tensor_purpose = TensorSubPurpose.Standard
        weight_buffer_size = full_weights_bytes
        # Update the memory snapshot to reflect the added size of the weights
        ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
    else:
        # Estimate the buffering cycle time for the full set of weights
        full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
            self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
        )
        cost.full_weight_transfer_cycles = full_transfer_cycles

        # Calculate the amount of prebuffering necessary (or what is possible with the
        # limited double buffer size)
        half_buffer_limit = buffer_limit_bytes // 2
        if full_transfer_cycles > slack_cycles:
            prebuffer_ratio = slack_cycles / full_transfer_cycles
            prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
        else:
            prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

        # Recompute the ratio from the (possibly clamped) byte count
        prebuffer_ratio = prebuffer_bytes / full_weights_bytes

        # Have to split the weights if the initial buffering can't store
        # all of the compressed weights
        if prebuffer_bytes < full_weights_bytes:
            block_depth = cost.block_config.ofm_block.depth

            # Choose initial prebuffering depth (already buffer clamped)
            prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
            prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

            # Calculate cycles executed during the prebuffer
            pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
            buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

            # Choose initial buffering depth and clamp to the double buffering limit
            buffering_depth = round_up(buffering_depth, block_depth)
            buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
            if buffering_bytes > half_buffer_limit:
                buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

            # Shrink the buffering/prebuffer depths until the encoded weights fit
            while True:
                # Attempt to buffer whole blocks
                if buffering_depth > block_depth:
                    buffering_depth = round_down(buffering_depth, block_depth)
                else:
                    buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                # Create list of depth slices
                depth_slices = [0]
                if prebuffer_depth < ref_cost.stripe.depth:
                    depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                depth_slices.append(ref_cost.stripe.depth)

                # Encode weights based on the depth slices
                cost.ofm_depth_slices = depth_slices
                encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                    self.arch,
                    sched_op.parent_op,
                    weight_tensor,
                    scale_tensor,
                    sched_op.kernel,
                    cost.block_config,
                    cost.ofm_depth_slices,
                )
                assert encoded_weights is not None
                # Chosen buffering might not fit at all, iterate until it does
                # or until the minimum usable slice size is reached
                if (
                    encoded_weights.double_buffer_size() <= buffer_limit_bytes
                    or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                ):
                    break

                # Halve whichever of the two depths is currently the larger one
                if buffering_depth > prebuffer_depth:
                    buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                else:
                    prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

            # Calculate cycles required to run the last op for use as future slack
            tail_cycles = self.estimate_op_performance(
                sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
            )
            cost.slack_buffering_cycles = tail_cycles.op_cycles

    # Determine whether the weights need to be double buffered
    weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

    # Only buffer weights if there's still space left for the buffer
    if weight_buffer_size <= buffer_limit_bytes:
        assert weight_buffer_size % 16 == 0
        # Determine whether to double buffer or single buffer
        double_buffer_size = encoded_weights.double_buffer_size()
        if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
            weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
        else:
            weight_tensor_purpose = TensorSubPurpose.Standard

        cost.buffered_weight_tensors = [
            self.buffer_tensor(
                encoded_weights,
                weight_tensor_purpose,
                encoded_weights.double_buffer_sizes[0],
                weight_tensor.name + "_buffer",
            )
        ]
        if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
            buf2 = self.buffer_tensor(
                encoded_weights,
                weight_tensor_purpose,
                encoded_weights.double_buffer_sizes[1],
                weight_tensor.name + "_buffer2",
            )
            cost.buffered_weight_tensors.append(buf2)

        # NOTE(review): presumably selects the buffer holding the final slice - confirm
        last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
        weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

        if ref_cost.cascade == 0:
            # Determine if the lifetime can be extended and pre-buffer the first weight buffer
            # under the previous operation
            cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

        cost.slack_buffering_memory -= weight_buffer_size
    else:
        # Don't slice or buffer - use the whole depth from persistent storage
        cost.ofm_depth_slices = ofm_full_depth_slices
        encoded_weights = full_weights
        encoded_scales = full_scales

    cost.npu_weights_tensor = encoded_weights
    cost.npu_scales_tensor = encoded_scales

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

850

Jacob Bohlin

2021-08-17 17:44:45 +0200

[diff] [blame]

851

def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
    """Create the tensor that buffers `src_tensor` in fast storage.

    The new tensor is a uint8 tensor of shape [1, 1, 1, buffer_size] called `name`. It
    references `src_tensor`, is placed in the fast storage memory area with the
    Scratch_fast memory type, and has the Weights purpose with the given `sub_purpose`.
    """
    buf = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
    buf.src_tensor = src_tensor

    # Placement
    buf.mem_area = self.arch.fast_storage_mem_area
    buf.mem_type = MemType.Scratch_fast

    # Usage
    buf.purpose = TensorPurpose.Weights
    buf.sub_purpose = sub_purpose
    return buf

859

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

860

def propose_minimal_schedule(self) -> Schedule:
    """Propose the "MIN" schedule, where each operator uses the smallest usable stripe.

    Operators are visited in reverse order. The stripe of an operator is its OFM shape
    with the height set to the vertical kernel stride of the operator visited just before
    it (the one that consumes its OFM), or 1 for the first operator visited.
    """
    min_schedule = Schedule(self.sg, "MIN")
    costs = min_schedule.cost_map

    # The op handled in the previous iteration - it consumes the current op's OFM
    consumer_op: Optional[SchedulerOperation] = None
    for sched_op in reversed(self.sched_ops):
        if consumer_op:
            stripe_height = consumer_op.kernel.stride.y
        else:
            stripe_height = 1
        smallest_stripe = sched_op.ofm.shape.with_height(stripe_height)

        op_cost = sched_op.create_scheduler_info(self.nng, smallest_stripe)
        op_cost.cycles = self.estimate_op_performance(sched_op, op_cost.block_config, sched_op.ofm.shape.depth)
        costs[sched_op] = op_cost

        consumer_op = sched_op

    return min_schedule

def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
    """Propose a new striping, starting from `final_stripe` and working backwards.

    Operators are visited in reverse order and those without a cost entry in
    `ref_schedule` are skipped. Each visited operator gets a fresh cost entry for the
    current stripe, with one weight buffer tensor per buffer tensor of the reference
    entry. The stripe for the next visited (preceding) operator is then derived from the
    current stripe height, the IFM shape and the vertical kernel stride.
    """
    ref_cost = ref_schedule.cost_map
    striped_schedule = Schedule(self.sg, label)

    stripe = final_stripe
    for sched_op in reversed(self.sched_ops):
        if sched_op not in ref_cost:
            # Not part of the sub-schedule
            continue

        # New cost entry for the current stripe
        cost = sched_op.create_scheduler_info(self.nng, stripe)

        # Weights buffered in the reference schedule stay buffered in the new proposal
        weight_tensor = cost.npu_weights_tensor
        for idx, ref_buffer in enumerate(ref_cost[sched_op].buffered_weight_tensors):
            new_buffer = self.buffer_tensor(
                weight_tensor,
                ref_buffer.sub_purpose,
                weight_tensor.double_buffer_sizes[idx],
                ref_buffer.name,
            )
            cost.buffered_weight_tensors.append(new_buffer)

        # Estimate performance
        cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
        striped_schedule.cost_map[sched_op] = cost

        # Work out the preceding Op's stripe.
        #
        # A cascaded upscaling Op may be asked to produce an odd stripe height. When any
        # op with the same cascade value as this one uses nearest-neighbour resampling,
        # the height is therefore forced back to an even value.
        force_even_stripe_heights = any(
            ref_cost.get(op, None)
            and ref_cost.get(sched_op, None)
            and ref_cost[op].cascade == ref_cost[sched_op].cascade
            and is_nearest(op.resampling_mode)
            for op in self.sched_ops
        )

        upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
        if force_even_stripe_heights:
            extra_rows = stripe.height % 2
        else:
            extra_rows = upscaling_remainder
        height = stripe.height + extra_rows
        stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

    return striped_schedule

932

933

def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
    """Return the estimated peak memory usage of `schedule`.

    Only operators with a cost entry in the schedule are considered. A cascaded operator
    contributes the memory usage recorded for its cascade. Any other operator contributes
    its IFM and OFM sizes, the storage size of its buffered weight tensors and its entry
    in `non_local_mem_usage` (0 when it has none). Returns 0 if no operator is considered.
    """
    op_costs = schedule.cost_map
    cascades = schedule.cascades

    peak = 0
    for sched_op in self.sched_ops:
        if sched_op not in op_costs:
            # Not part of the sub-schedule
            continue

        op_cost = op_costs[sched_op]
        if op_cost.cascade:
            # The cascade's figure already includes the non-local memory usage
            usage = cascades[op_cost.cascade].mem_usage
        else:
            weight_buffer_bytes = sum(tens.storage_size() for tens in op_cost.buffered_weight_tensors)
            usage = (
                sched_op.ifm_size_in_bytes()
                + sched_op.ofm_size_in_bytes()
                + weight_buffer_bytes
                + non_local_mem_usage.get(sched_op, 0)
            )

        peak = max(usage, peak)

    return peak

961

Johan Alfvén

255dad7

2022-07-16 18:27:05 +0200

[diff] [blame]

962

def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
    """Make `min_schedule` the current schedule and build its cascades.

    The op memory snapshot is refreshed first. The memory in use at each operator's time
    index, minus the operator's own usage, is then handed to a CascadeBuilder as the
    non-local memory usage before the cascades are built.
    """
    # Update memory snapshot
    self.sg.schedule = min_schedule
    self.update_op_memory_snapshot(min_schedule)

    # Calculate residual memory for the Min schedule
    residual_usage = {}
    for sched_op in self.sched_ops:
        time_index = min_schedule.cost_map[sched_op].time_index

        # For Dedicated SRAM only the intermediate buffers are in SRAM, hence the op's own
        # usage is 0. Otherwise the Min schedule only has the ifm and ofm in SRAM (no
        # buffered weight tensors).
        own_usage = (
            0 if self.arch.is_spilling_enabled() else sched_op.ifm_size_in_bytes() + sched_op.ofm_size_in_bytes()
        )

        residual_usage[sched_op] = min_schedule.memory_snapshot[time_index] - own_usage

    # Create cascades for the Min schedule
    builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), residual_usage)
    builder.build_cascades(min_schedule, max_template, memory_limit)

984

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

985

def optimize_sub_schedule(
    self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
) -> Schedule:
    """Optimize the part of `ref_schedule` that is covered by `cascade_info`.

    A sub-schedule holding only the covered Ops is created and buffered. Increasing stripe
    heights for the final Op are then proposed one by one; a proposal is kept while it
    fits in `memory_limit` and does not create more cascades than the first proposal did.

    Returns the last proposal that was kept, or None if no proposal was kept.
    """
    ref_cost = ref_schedule.cost_map

    # Extract the ops that are part of this sub-schedule
    start, end = cascade_info.start, cascade_info.end
    sub_schedule_ops = self.sched_ops[start : end + 1]

    # The sub-schedule only holds the costs of its own Ops
    sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
    for sched_op in sub_schedule_ops:
        sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

    sub_schedule.cascades[end] = cascade_info
    # The memory snapshot is shared with the reference schedule
    sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

    first_op = sub_schedule_ops[0]

    # Memory that is live during the sub-schedule without being part of it
    time_for_cascade = ref_cost[first_op].time_index
    mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage

    # An initial IFM with other consumers has to live throughout the whole sub-schedule,
    # whether it's included in a cascade or not
    if len(first_op.ifm.connection.consumers) > 1:
        persistent_initial_ifm = first_op.ifm_size_in_bytes()
    else:
        persistent_initial_ifm = 0

    # Non-local memory usage per Operator - every op but the first also carries the
    # persistent initial IFM
    non_local_mem_usage = {
        sched_op: mem_usage_parallel_to_sub_schedule + (persistent_initial_ifm if idx != 0 else 0)
        for idx, sched_op in enumerate(sub_schedule_ops)
    }

    cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

    # Start by adding buffering
    buffered_sub_schedule = self.propose_schedule_buffering(
        sub_schedule, self.scheduler_options.optimization_sram_limit
    )
    # The cascades are taken over from the unbuffered schedule
    buffered_sub_schedule.cascades = sub_schedule.cascades

    # Generate the possible stripings for the final Op in the sub-schedule.
    # The stripe used in the MIN schedule is not tested, since that one will be used
    # anyway if no new cascades are created below.
    last_op = sub_schedule_ops[-1]
    final_ofm_shape = last_op.ofm.shape
    min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1
    max_stripe_h = final_ofm_shape.height // 2
    possible_stripes = [final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, max_stripe_h + 1)]

    # Propose different striping
    best_schedule = None
    max_nbr_of_cascades = 0
    for iteration, proposed_stripe in enumerate(possible_stripes):
        proposed_schedule = self.propose_schedule_striping(
            proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
        )

        cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

        proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
        nbr_of_cascades = len(proposed_schedule.cascades)

        if iteration == 0:
            # The first proposal sets the limit that prevents splitting up the cascades.
            # Long cascades are better in order to reduce IFM/IFM dram bandwidth.
            max_nbr_of_cascades = nbr_of_cascades

        # Stop at the first proposal that does not fit
        fits = (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades
        if not fits:
            break

        best_schedule = proposed_schedule
        if not proposed_schedule.cascades:
            # No cascading required - early exit
            break

    return best_schedule

def optimize_schedule(
    self,
    schedule: Schedule,
    max_sched: Schedule,
    max_template: Schedule,
    options: SchedulerOptions,
) -> Schedule:
    """Optimize every cascade of `schedule` and return a buffered version of the result.

    `max_sched` is returned unchanged when its peak fast storage usage is below the
    optimization SRAM limit and spilling is not enabled. Otherwise each cascade is
    optimized as a sub-schedule; successful results replace the cascade and its costs in
    `schedule`, which is modified in place and set as the current schedule.
    """
    sram_limit = options.optimization_sram_limit

    max_sched_fits = max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled()
    if max_sched_fits:
        # The maximum performance schedule fits within the SRAM target
        return max_sched

    # The cascades may change during the loop, so walk over a copy of them
    for cascade_info in list(schedule.cascades.values()):
        opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
        if not opt_sub_schedule:
            continue

        # Swap the existing cascade for the optimized Op and cascade costs
        del schedule.cascades[cascade_info.end]
        schedule.cost_map.update(opt_sub_schedule.cost_map)
        schedule.cascades.update(opt_sub_schedule.cascades)

    # Update memory snapshot
    self.sg.schedule = schedule
    self.update_op_memory_snapshot(schedule)

    # Buffer the optimized schedule; the cascade metadata comes from the unbuffered one
    optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
    optimized_sched.cascades = schedule.cascades
    return optimized_sched

1102

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame]

1103

def optimize_weight_buffering_size(
    self,
    min_schedule: Schedule,
    options: SchedulerOptions,
):
    """Try a schedule with reduced weight buffers and keep it only if it performs better.

    The total and DRAM-access cycles of the current schedule are measured first. Every op
    whose weight buffer live range overlaps an evicted feature map gets that feature
    map's size added to its `evicted_fms_size`. Buffering is then re-proposed from
    `min_schedule`, applied, and measured again. The new schedule is kept when the total
    cycles did not get worse and the DRAM cycles improved; otherwise the original
    schedule is restored and re-applied.

    Nothing is returned; the outcome is left in `self.sg.schedule`.
    """
    # Baseline: remember the current schedule and its performance
    default_schedule = self.sg.schedule
    npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
    default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
    default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

    # Restore mem/type for scratched_fms
    for tens in self.scratched_fms:
        tens.mem_area = self.scratched_fms[tens][0]
        tens.mem_type = self.scratched_fms[tens][1]

    self.update_op_memory_snapshot(self.sg.schedule)

    # Collect live ranges from tensors
    memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
    lr_graph = live_range.LiveRangeGraph()
    for mem_area, mem_type_set in memories_list:
        live_range.extract_live_ranges_from_cascaded_passes(
            self.nng.get_root_subgraph(),
            mem_area,
            mem_type_set,
            lr_graph,
            Tensor.AllocationQuantum,
        )

    # Find the relation between the sched_op and the buffering tensor
    weight_ops = {}
    for sched_op in self.sched_ops:
        cost = self.sg.schedule.cost_map[sched_op]
        for tens in cost.buffered_weight_tensors:
            weight_ops[tens] = sched_op

    # Filter out weight buffer live ranges
    weight_lrs = []
    for lr in lr_graph.lrs:
        for tens in lr.tensors:
            if weight_ops.get(tens):
                weight_lrs.append(lr)
                break

    # See if any evicted fm overlaps with a weight buffering op.
    # If this is the case add a size limitation to the buffering op
    for lr in self.evicted_fms:
        for weight_lr in weight_lrs:
            if lr.overlaps_ranges(weight_lr):
                for tens in weight_lr.tensors:
                    sched_op = weight_ops.get(tens)
                    if sched_op:
                        # Add size reduction to the op (only once per live range)
                        sched_op.evicted_fms_size += lr.size
                        break

    # Restart from the min schedule
    self.sg.schedule = min_schedule
    self.update_op_memory_snapshot(self.sg.schedule)

    # Run schedule buffering - with weight buffer size reduction
    schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
    schedule.cascades = self.sg.schedule.cascades
    self.sg.schedule = schedule

    # Apply the new buffer schedule and calculate the new performance
    self.update_op_memory_snapshot(self.sg.schedule)
    self.apply_schedule(self.sg.schedule)
    self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

    npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
    new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
    new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

    # Relative improvements, rounded to two decimals; 0 when the baseline is 0
    improvement_tot = (
        round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
    )
    improvement_dram = (
        round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
    )

    # Compare both total and dram improvement
    if not (improvement_tot >= 0 and improvement_dram > 0):
        # No improvement, restore the default schedule
        for sched_op in self.sched_ops:
            sched_op.evicted_fms_size = 0

        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.sg.schedule = default_schedule
        self.update_op_memory_snapshot(self.sg.schedule)
        self.apply_schedule(self.sg.schedule)
        self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

1197

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1198

def apply_schedule(self, sched: Schedule):
    """Apply `sched` as the final solution.

    For every operator the block config of its parent pass is set from the schedule and
    the buffered weight tensors are pointed at the operator's NPU weights tensor. An
    operator that has a buffer in its cascade additionally gets its IFM parent tensor
    turned into a rolling buffer in fast storage.
    """
    for sched_op in self.sched_ops:
        op_cost = sched.cost_map[sched_op]
        cascade = sched.cascades.get(op_cost.cascade, None)

        if cascade and sched_op in cascade.buffers:
            ifm_tens = sched_op.ifm.connection.parent_tens
            # Memory area and type
            ifm_tens.mem_area = self.arch.fast_storage_mem_area
            ifm_tens.mem_type = MemType.Scratch_fast
            # Rolling buffer
            ifm_tens.set_format(TensorFormat.NHCWB16, self.arch)
            ifm_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade.buffers[sched_op].height)

        sched_op.parent_ps.block_config = op_cost.block_config.old_style_representation()

        # Make sure the src_tensor reference is set correctly
        for buffer_tens in op_cost.buffered_weight_tensors:
            buffer_tens.src_tensor = op_cost.npu_weights_tensor

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1217

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1218

def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
    """Decide which non-cascaded OFM tensors stay in fast storage under staging_limit.

    All eligible OFMs are first moved to fast storage (their original mem_area/mem_type is
    remembered once in self.scratched_fms). If the resulting temporal memory usage exceeds
    staging_limit, live ranges are either evicted (restored to their original memory), kept,
    or handed to FastStorageComponentAllocator, which picks the combination with the highest
    estimated element access. Live ranges evicted because they can never fit, or evicted by
    the component allocator, are collected in self.evicted_fms.

    Parameters:
        schedule: schedule whose cost_map provides the per-op cost (cascade, block_config)
        staging_limit: maximum number of bytes of fast storage that may be used at any time
    """
    fast_storage_type = MemType.Scratch_fast
    fast_storage_mem_area = self.arch.fast_storage_mem_area
    self.evicted_fms = []

    # Force all OFMs to fast-storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]
        if cost.cascade == 0 and sched_op.get_dependants():
            ofm_tens = sched_op.ofm.connection.parent_tens
            if not any(cons is None for cons in ofm_tens.consumer_list):
                if ofm_tens not in self.scratched_fms:
                    # Remember default mem area and mem type, only done once
                    self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)

                ofm_tens.mem_area = fast_storage_mem_area
                ofm_tens.mem_type = fast_storage_type

    # Collect live ranges from tensors
    lr_graph = live_range.LiveRangeGraph()
    live_range.extract_live_ranges_from_cascaded_passes(
        self.nng.get_root_subgraph(),
        fast_storage_mem_area,
        set((MemType.Scratch, MemType.Scratch_fast)),
        lr_graph,
        Tensor.AllocationQuantum,
    )
    max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

    # If true, everything fits and we can proceed
    if max(max_mem_usage) <= staging_limit:
        return

    # Build up the base memory usage by removing the
    # mem_usage of the lrs we previously moved to fast-storage
    base_mem_usage = np.array(max_mem_usage)
    curr_lrs = []
    for lr in lr_graph.lrs:
        if any(tens in self.scratched_fms for tens in lr.tensors):
            curr_lrs.append(lr)
            base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size

    competing_lrs = []
    competing_tens_access = {}
    for lr in curr_lrs:
        base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
        # If true, the lr will never fit and may thus be evicted
        if base_usage + lr.size > staging_limit:
            self.evicted_fms.append(lr)
            FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
            continue
        # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
        # the memory limit cannot be exceeded if max_mem_usage does not.
        # Thus, the affected lrs can remain in fast-storage if the following is true
        if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
            FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
        else:
            competing_lrs.append(lr)
            for tens in lr.tensors:
                competing_tens_access[tens] = 0

    # All lrs and their tensors have been handled if there are no competing lrs, we may thus return
    if not competing_lrs:
        return

    # Estimate element access for all tensors that are competing for a place in fast-storage.
    # This number is used when deciding which tensor stays in fast-storage
    for sched_op in self.sched_ops:
        cost = schedule.cost_map[sched_op]

        ifm_tens = sched_op.ifm.connection.parent_tens
        ifm2_tens = sched_op.ifm2.connection.parent_tens if sched_op.ifm2 else None
        ofm_tens = sched_op.ofm.connection.parent_tens

        ifm_competes = ifm_tens in competing_tens_access
        ifm2_competes = bool(sched_op.ifm2) and ifm2_tens in competing_tens_access
        ofm_competes = ofm_tens in competing_tens_access
        if not (ifm_competes or ifm2_competes or ofm_competes):
            continue

        # The estimate depends only on the op, its block config and the OFM depth, so it is
        # computed once per op and shared by the IFM, IFM2 and OFM contributions
        access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
        if ifm_competes:
            competing_tens_access[ifm_tens] += access.ifm_read[0]
        if ifm2_competes:
            competing_tens_access[ifm2_tens] += access.ifm_read[1]
        if ofm_competes:
            competing_tens_access[ofm_tens] += access.ofm_write

    competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))

    # Remove lrs that have a live range that is too long compared to others.
    # They are causing problems for the HillClimb Allocator when it has to
    # change the allocation indices, in order to fit all the allocations into SRAM.
    # This problem only occurs in larger networks with complex graphs.
    #
    # Limit the number of items for allocate_component to work with to MAX_EXHAUSTIVE_ITEMS
    # at a time. Too many will give too long compilation time
    #
    # Too long is currently decided to be (based on experience, analyzing many networks):
    # Compare lr at position i with lr at position i + MAX_EXHAUSTIVE_ITEMS.
    # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
    max_items = FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS
    max_life_range = FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
    if len(competing_lrs) > max_items:
        # Iterate over a snapshot of the list because the original is modified in-loop
        snapshot = competing_lrs.copy()
        last_pos = len(snapshot) - 1
        for i, lr in enumerate(snapshot):
            if lr.end_time - lr.start_time < max_life_range:
                # Skip small ranges
                continue

            # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS (clamped to the
            # last position, so the last lr is only ever compared with itself and never removed)
            cmp_pos = min(i + max_items, last_pos)

            # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
            if lr.end_time > snapshot[cmp_pos].end_time + max_life_range:
                # Current lr live time stands out, remove it. No use adding it to the
                # evicted_fms list since the lr should not be included in the fast storage allocation
                FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
                competing_lrs.remove(lr)

    component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)

    def allocate(component):
        # Run the exhaustive search on one component and apply its result
        component_allocator.allocate_component(
            component_allocator,
            component,
            max_mem_usage,
            base_mem_usage,
            staging_limit,
            self.scratched_fms,
            competing_tens_access,
            self.evicted_fms,
        )

    # Build up components and then allocate each separately
    start = 0
    end_time = competing_lrs[0].end_time
    for i, lr in enumerate(competing_lrs):
        nbr_items = i - start
        if lr.start_time <= end_time and nbr_items < max_items:
            end_time = max(end_time, lr.end_time)
        else:
            # Number of items reached max items or current lr's start time
            # does not overlap with previous lr's end time
            allocate(competing_lrs[start:i])
            start = i
            end_time = lr.end_time
    allocate(competing_lrs[start:])

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1378

1379

def move_constant_data(self):
    """Determine if data can be moved from permanent storage to another memory area. A move
    will generate a DMA command in the high-level command stream.

    Qualifying input tensors are replaced, on both the parent op and the parent pass, by a clone
    created with clone_into_fast_storage(); LUT clones are placed in MemArea.Shram."""
    for sched_op in self.sched_ops:
        parent_op = sched_op.parent_op
        lut_in_use = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
        usable_banks = self.arch.available_shram_banks(lut_in_use) - self.arch.shram_reserved_output_banks
        max_ifm_shram_avail = usable_banks * self.arch.shram_bank_size // 2

        for idx, tens in enumerate(parent_op.inputs):
            if tens.mem_type in (MemType.Scratch, MemType.Scratch_fast):
                continue

            # Tensor is in permanent storage
            is_lut = tens.purpose == TensorPurpose.LUT

            # Only when permanent storage differs from feature map storage, there is a point moving the data
            storage_differs = (
                tens.mem_area in self.arch.permanent_storage_mem_area
                and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
            )
            if not (storage_differs or is_lut):
                continue

            # Besides LUTs, only the elementwise broadcast case is a candidate
            if not is_lut and not (
                tens.purpose == TensorPurpose.FeatureMap
                and sched_op.op_type.is_binary_elementwise_op()
                and tens.shape != []
                and sched_op.ifm.shape != sched_op.ofm.shape
                and parent_op.write_shape is None
                and tens.storage_size() > max_ifm_shram_avail
            ):
                continue

            only_vector_product_consumers = all(
                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct for oper in tens.consumers()
            )
            if only_vector_product_consumers and not is_lut:
                continue

            new_tens = tens.clone_into_fast_storage(self.arch)
            if is_lut:
                new_tens.mem_area = MemArea.Shram

            new_tens.consumer_list.append(parent_op)
            parent_op.inputs[idx] = new_tens
            # If the index is out of range, IFM and IFM2 are the same tensor
            # and pass inputs don't have duplicates
            if idx < len(sched_op.parent_ps.inputs):
                sched_op.parent_ps.inputs[idx] = new_tens

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1424

1425

def print_schedule(self, schedule: Schedule):
    """Print the schedule to stdout: one entry per scheduled op, then the cascades.

    Ops without an entry in schedule.cost_map are skipped. The SRAM usage shown for an op is
    the memory snapshot value at the op's time index, or 0 when the snapshot has no such index.
    """
    print(f"Schedule: '{schedule.name}'")

    for op in self.sched_ops:
        if op not in schedule.cost_map:
            # Sub-schedule printing
            continue

        info = schedule.cost_map[op]
        print(f"\t{op.index}: Operation {op.name} - OFM {op.ofm.shape}")
        print(f"\t\tType: {op.op_type}")
        print(f"\t\tKernel: {op.kernel}")
        print(f"{info}")

        if info.time_index < len(schedule.memory_snapshot):
            sram_used = schedule.memory_snapshot[info.time_index]
        else:
            sram_used = 0
        print(f"\t\tSRAM Used: {sram_used} bytes")

    print("\tCascades:")
    for position, cascade_info in enumerate(schedule.cascades.values()):
        print(f"\t\t{position}: {cascade_info.start} -> {cascade_info.end}, size: {cascade_info.mem_usage}")

Patrik Gustavsson

feeb06d

2020-04-22 12:53:47 +0200

[diff] [blame]

1447

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1448

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1449

def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Runs the tensor allocator on the root subgraph for the current schedule.

    With spilling enabled, Scratch_fast tensors are allocated in the fast storage memory area
    first and Scratch tensors in the feature map storage area second. Without spilling, both
    memory types are allocated together in the feature map storage area.

    The allocator settings (tensor_allocator, verbose_allocation, cpu_tensor_alignment and
    hillclimb_max_iterations) are taken from options. Nothing is returned.
    """
    root_sg = nng.get_root_subgraph()

    if arch.is_spilling_enabled():
        # Order is important: fast storage is allocated before feature map storage
        alloc_list = [
            (arch.fast_storage_mem_area, {MemType.Scratch_fast}),
            (arch.feature_map_storage_mem_area, {MemType.Scratch}),
        ]
    else:
        alloc_list = [(arch.feature_map_storage_mem_area, {MemType.Scratch, MemType.Scratch_fast})]

    for target_mem_area, target_mem_types in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            target_mem_area,
            target_mem_types,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
            hillclimb_max_iterations=options.hillclimb_max_iterations,
        )

1480

1481

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1482

class FastStorageComponentAllocator:
    """Exhaustive search for which live ranges of a component stay in fast storage.

    A live range (lr) is expected to provide start_time, end_time (inclusive), size and
    tensors. Memory usage is tracked per time step in two sequences:
      * base_mem_usage: usage of everything that is decided to stay in fast storage
      * max_mem_usage: usage if every still-undecided lr also stays in fast storage
    """

    # Margin (in time steps) used when judging whether a live range is too long
    MAX_EXHAUSTIVE_LIFE_RANGE = 20
    # Maximum number of live ranges handed to one exhaustive search
    MAX_EXHAUSTIVE_ITEMS = 20

    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        # base_mem_usage is shared with the caller; it is modified during the search
        # but restored before allocate_exhaustive() returns
        self.base_mem_usage = base_mem_usage
        # Private copy, so the search never touches the caller's max usage
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        self.evicted = []
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_score = 0
        self.competing_tens_access = {}

    def allocate_exhaustive(self, ix, score):
        """Recursively try keeping/evicting self.lrs[ix:] and record the best-scoring choice.

        ix is the index of the lr to decide next and score is the accumulated element access
        of the lrs kept so far. When all lrs are decided and score beats self.best_score,
        the current eviction flags are stored in self.evicted.
        """
        # Favour tensors with highest element access (score)
        if ix >= len(self.lrs):
            if score > self.best_score:
                self.best_score = score
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        # Invariant: base usage never exceeds max usage anywhere in the live range.
        # end_time is inclusive, exactly as in the slices and updates below.
        for t in range(lr.start_time, lr.end_time + 1):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        # The lr fits on top of what is already kept
        can_fit = base_usage + lr.size <= self.staging_limit
        # The lr fits even if every undecided lr is kept as well
        always_fits = can_fit and max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit

        if can_fit:
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            tens = lr.tensors[0]
            # Tensor is being included - add tensor element access to the score
            self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[tens])
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            # Evicting is only worth exploring when keeping the lr is not guaranteed to work
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, score)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        """Add (increase=True) or subtract lr.size in place over the lr's time span; returns mem_usage."""
        delta = lr.size if increase else -lr.size
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += delta
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        """Remove lr from max_mem_usage and restore the original memory of its tensors.

        scratched_fms maps a tensor to its original (mem_area, mem_type); tensors that are
        not in the map are left untouched.
        """
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area, tens.mem_type = scratched_fms[tens]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        """Add lr to base_mem_usage, asserting that staging_limit is not exceeded."""
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(
        self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms, competing_tens_access, evicted_fms
    ):
        """Find the best allocation for the component lrs and apply it.

        allocator runs the search. Evicted lrs are removed from max_mem, restored via
        scratched_fms and added to evicted_fms; kept lrs are added to min_mem and removed
        from evicted_fms. competing_tens_access maps a tensor to its element access score.
        """
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [False] * sz
        allocator.curr_evicted = [False] * sz
        # Below any reachable score, so the first complete permutation is always recorded
        allocator.best_score = -1
        allocator.competing_tens_access = competing_tens_access
        # Recursively evaluate all permutations of allocations of the lrs found in the component.
        # For every permutation that fits within the staging_limit there is a score calculated.
        # The permutation with the highest score will then be chosen. The score is calculated
        # as the sum of the actual element access (ifm read and ofm write) for all the
        # included tensors. So it is not necessarily the tensor with the biggest size that ends up
        # being included in the result.
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for lr, is_evicted in zip(lrs, allocator.evicted):
            if is_evicted:
                self.evict(lr, max_mem, scratched_fms)
                if lr not in evicted_fms:
                    evicted_fms.append(lr)
            else:
                self.keep(lr, min_mem, staging_limit)
                if lr in evicted_fms:
                    evicted_fms.remove(lr)

erik.andersson@arm.com

2022-02-02 14:03:15 +0100

[diff] [blame]

1579

1580

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1581

def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):

1582

"""Entry point for the Scheduler"""

1583

# Initialize CPU subgraphs

1584

schedulers = dict()

1585

# Initialize schedulers with max schedule. Only schedule NPU subgraphs

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1586

for sg in nng.subgraphs:

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1587

if sg.placement != PassPlacement.Npu:

1588

# Create cascaded passes for CPU Ops

1589

cascaded_passes = []

1590

for idx, ps in enumerate(sg.passes):

1591

cps = CascadedPass(

Jonas Ohlsson

2022-03-30 10:30:25 +0200

[diff] [blame]

1592

ps.name,

1593

SchedulingStrategy.WeightStream,

ps.inputs,

[],

ps.outputs,

[ps],

ps.placement,

False,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1600

)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1601

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1602

cps.time = idx

1603

ps.cascade = cps

1604

cascaded_passes.append(cps)

Andreas Nevalainen

2020-11-19 11:27:50 +0100

[diff] [blame]

1605

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1606

sg.cascaded_passes = cascaded_passes

1607

else:

1608

# Npu subgraph - create schedule

1609

scheduler = Scheduler(nng, sg, arch, scheduler_options)

1610

schedulers[sg] = scheduler

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1611

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1612

scheduler.create_scheduler_representation(arch)

1613

sg.sched_ops = scheduler.sched_ops

1614

scheduler.move_constant_data()

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1615

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1616

# Create the Max schedule template

1617

max_schedule_template = scheduler.create_initial_schedule()

1618

scheduler.max_schedule = max_schedule_template

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1619

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1620

# Create the optimised Max schedule

1621

sg.schedule = max_schedule_template

1622

scheduler.update_op_memory_snapshot(max_schedule_template)

Tim Hall

2021-06-17 17:02:31 +0100

[diff] [blame]

1623

opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1624

sg.schedule = opt_max_schedule

1625

scheduler.update_op_memory_snapshot(opt_max_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1626

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1627

# Create Min schedule

1628

min_schedule = scheduler.propose_minimal_schedule()

1629

initial_sram_limit = scheduler_options.optimization_sram_limit

1630

if scheduler_options.optimization_strategy == OptimizationStrategy.Size:

1631

initial_sram_limit = scheduler.min_memory_req

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1632

Johan Alfvén

255dad7

2022-07-16 18:27:05 +0200

[diff] [blame]

1633

# Build cascades for Min schedule

1634

scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1635

sg.schedule = min_schedule

1636

scheduler.update_op_memory_snapshot(min_schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1637

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1638

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:

1639

# Create an optimized schedule

1640

sg.schedule = scheduler.optimize_schedule(

1641

min_schedule, opt_max_schedule, max_schedule_template, scheduler_options

1642

)

1643

scheduler.update_op_memory_snapshot(sg.schedule)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1644

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1645

scheduler.apply_schedule(sg.schedule)

1646

scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

Andreas Nevalainen

2020-10-28 15:42:08 +0100

[diff] [blame]

1647

Johan Alfvén

2022-05-05 08:42:46 +0200

[diff] [blame]

1648

if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:

1649

# It might be possible to gain performance by reducing

1650

# weight buffer size and instead fit fms in fast storage

1651

scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)

1652

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

1653

if scheduler_options.verbose_schedule:

1654

scheduler.print_schedule(sg.schedule)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

1655

Tim Hall