# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


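# Illustrative sketch (hypothetical values, not part of the original source): the NHCWB16
# brick format stores the depth axis in bricks of 16, so storage-size helpers round the
# depth up. For example:
#   shape_for_format(Shape4D(1, 8, 8, 20), TensorFormat.NHCWB16) -> Shape4D(1, 8, 8, 32)
#   shape_for_format(Shape4D(1, 8, 8, 20), TensorFormat.NHWC)    -> Shape4D(1, 8, 8, 20)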
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

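    # Illustrative note (hypothetical values, not part of the original source): for an OFM
    # stripe of height 1 and width 1 with a 3x3 kernel, stride 1 and no upscaling,
    # get_ifm_area_required typically reports an input area of roughly the kernel footprint
    # (3x3), so the minimum-stripe input below is driven by the kernel dimensions and the
    # resampling mode.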
    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

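    # Illustrative example (hypothetical, not part of the original source): an elementwise
    # Add with OFM depth 20 (not a multiple of 16) whose IFM is already NHWC and has the
    # same shape, format and dtype as the OFM can be overwritten by its output; in that
    # case avoid_nhcwb16_for_ofm() below returns True and the OFM is kept linear (NHWC)
    # instead of being converted to NHCWB16.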
    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        """For elementwise ops, when the IFM is in NHWC format and its depth is not brick-format
        aligned (a multiple of 16), it is better to enforce NHWC format on the OFM if the OFM can
        overwrite the IFM, in order to reduce memory transactions"""

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

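    # Worked example for the slack calculation below (hypothetical numbers, not part of the
    # original source): if the staging limit is 1000 KB and the reference schedule already
    # uses 700 KB at this Op's time index, slack_buffering_memory becomes 300 KB. If
    # use_fast_storage_for_feature_maps has evicted 100 KB of feature maps for this Op,
    # 300 / 100 = 3.0 >= 1.5, so the weight buffer limit is reduced to 200 KB before weight
    # buffering is proposed.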
    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are fms that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

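    # Worked example for the prebuffering maths below (hypothetical numbers, not part of the
    # original source): with full_weights_bytes = 64 KB, slack_cycles = 1000 and a full
    # transfer estimated at 4000 cycles, prebuffer_ratio = 0.25, so roughly 16 KB of weights
    # (clamped to half the buffer limit and rounded to OFMSplitDepth granularity) are fetched
    # under the previous operator; the remaining depth slices are streamed in while this
    # operator runs.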
    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator, and how much SRAM is unused, for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)

                # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two
                # slices (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two
                # buffers were used.
                last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

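    # Illustrative note (hypothetical, not part of the original source): in the minimal
    # schedule every OFM stripe is as short as its consumer allows. If the consuming
    # operator has a kernel stride of 2 in y, min_stripe_height below becomes 2, so this
    # operator is proposed to produce its OFM two rows at a time; the last operator in the
    # execution order has no consumer here and gets a stripe height of 1.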
    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

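    # Worked example for the striping proposal below (hypothetical numbers, not part of the
    # original source): if the current Op's OFM stripe has height 8 and the Op's kernel has
    # stride 2 in y, the preceding Op is asked to produce a stripe of height 8 * 2 = 16;
    # when an upscaling or nearest-neighbour Op is part of the cascade the height is first
    # adjusted so that it stays even / aligned with the upscaling factor.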
    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

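    # Illustrative note (hypothetical numbers, not part of the original source): "non-local"
    # memory usage is everything alive at an Op's time index that is not the Op's own
    # IFM/OFM. For example, if the MIN-schedule snapshot shows 500 KB at the Op's time index
    # and the Op's own IFM + OFM account for 300 KB, then 200 KB is non-local and must stay
    # available when building cascades around that Op.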
    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has the ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFM's that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

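    # Illustrative note (hypothetical numbers, not part of the original source): for a
    # cascade whose final OFM is 64 rows high, the loop below tries stripe heights from just
    # above the height already used in the reference (MIN) schedule up to 32 (half the OFM
    # height), keeping the last proposal that still fits the memory limit without splitting
    # the cascade into more cascades than the first proposal produced.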
    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/IFM dram bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

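    # Illustrative note (hypothetical numbers, not part of the original source): the weight
    # buffer size reduction below is only kept if it does not hurt performance. For example,
    # if the default schedule is estimated at 1,000,000 total cycles with 400,000
    # DRAM-access cycles and the reduced-buffer schedule at 990,000 / 380,000, then
    # improvement_tot = 0.01 and improvement_dram = 0.05, so the new schedule is kept;
    # otherwise the default schedule and the original feature-map placements are restored.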
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001109 def optimize_weight_buffering_size(
1110 self,
1111 min_schedule: Schedule,
1112 options: SchedulerOptions,
1113 ):
1114 default_schedule = self.sg.schedule
Tim Hallc1be0872022-03-03 17:50:52 +00001115 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001116 default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1117 default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1118
1119 # Restore mem/type for scratched_fms
1120 for tens in self.scratched_fms:
1121 tens.mem_area = self.scratched_fms[tens][0]
1122 tens.mem_type = self.scratched_fms[tens][1]
1123
1124 self.update_op_memory_snapshot(self.sg.schedule)
1125
1126 # Collect live ranges from tensors
1127 memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
1128 lr_graph = live_range.LiveRangeGraph()
1129 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001130 live_range.extract_live_ranges_from_schedule(
1131 self.sg,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001132 mem_area,
1133 mem_type_set,
1134 lr_graph,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001135 )
1136
1137 # Find the relation between the sched_op and the buffering tensor
1138 weight_ops = {}
1139 for sched_op in self.sched_ops:
1140 cost = self.sg.schedule.cost_map[sched_op]
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001141 for tens in cost.buffered_weight_tensors:
1142 weight_ops[tens] = sched_op
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001143
1144 # Filter out weight buffer live ranges
1145 weight_lrs = []
1146 for lr in lr_graph.lrs:
1147 for tens in lr.tensors:
1148 if weight_ops.get(tens):
1149 weight_lrs.append(lr)
1150 break
1151
1152 # See if any evicted fm overlaps with a weight buffering op.
1153 # If this is the case add a size limitation to the buffering op
1154 for lr in self.evicted_fms:
1155 for weight_lr in weight_lrs:
1156 if lr.overlaps_ranges(weight_lr):
1157 for tens in weight_lr.tensors:
1158 sched_op = weight_ops.get(tens)
1159 if sched_op:
1160 # Add size reduction to the op
1161 sched_op.evicted_fms_size += lr.size
1162 break
1163
1164 self.sg.schedule = min_schedule
1165 self.update_op_memory_snapshot(self.sg.schedule)
1166
1167 # Run schedule buffering - with weight buffer size reduction
1168 schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
1169 schedule.cascades = self.sg.schedule.cascades
1170 self.sg.schedule = schedule
1171
1172 # Apply new buffer schdule and calc new performance
1173 self.update_op_memory_snapshot(self.sg.schedule)
1174 self.apply_schedule(self.sg.schedule)
1175 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1176
Tim Hallc1be0872022-03-03 17:50:52 +00001177 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001178 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1179 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1180
Tim Hall8bc7a652022-05-19 15:29:23 +01001181 improvement_tot = (
1182 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1183 )
1184 improvement_dram = (
1185 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1186 )
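        # For example (hypothetical numbers): with default_tot_cycles = 1_000_000 and
        # new_tot_cycles = 950_000, improvement_tot = round((1_000_000 - 950_000) / 1_000_000, 2) = 0.05;
        # a negative value means the new schedule is slower than the default one.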
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001187
1188 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001189 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001190 # No improvement, restore the default schedule
1191 for sched_op in self.sched_ops:
1192 sched_op.evicted_fms_size = 0
1193
1194 for tens in self.scratched_fms:
1195 tens.mem_area = self.scratched_fms[tens][0]
1196 tens.mem_type = self.scratched_fms[tens][1]
1197
1198 self.sg.schedule = default_schedule
1199 self.update_op_memory_snapshot(self.sg.schedule)
1200 self.apply_schedule(self.sg.schedule)
1201 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1202
Tim Halld8339a72021-05-27 18:49:40 +01001203 def apply_schedule(self, sched: Schedule):
1204 """Applies the given schedule as a final solution"""
1205 for sched_op in self.sched_ops:
1206 op_info = sched.cost_map[sched_op]
1207 cascade_info = sched.cascades.get(op_info.cascade, None)
1208 if cascade_info and sched_op in cascade_info.buffers:
1209 buffer_tens = sched_op.ifm.connection.parent_tens
1210 # Apply memory area and type
1211 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1212 buffer_tens.mem_type = MemType.Scratch_fast
1213 # Apply Rolling buffer
1214 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1215 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
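                # RollingBufferY: only a window of `height` rows of the cascade's intermediate
                # tensor is kept resident at a time, which is what lets the cascade's buffer
                # fit in fast storage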
1216
1217 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1218
1219 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001220 for tens in op_info.buffered_weight_tensors:
1221 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001222
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001223 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001224 """Finds the set of feature maps that fits within the staging limit and has the largest combined number of
1225 access cycles, then moves those feature maps into fast storage"""
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001226 max_mem_usage = []
1227 base_mem_usage = []
1228 fast_storage_type = MemType.Scratch_fast
1229 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001230 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001231
1232 # Force all OFMs to fast-storage
1233 for sched_op in self.sched_ops:
1234 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001235 if cost.cascade == 0 and sched_op.get_dependants():
1236 ofm_tens = sched_op.ofm.connection.parent_tens
1237 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001238 if ofm_tens not in self.scratched_fms:
1239 # Remember default mem area and mem type, only done once
1240 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1241
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001242 ofm_tens.mem_area = fast_storage_mem_area
1243 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001244
1245 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001246 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001247 lr_graph = live_range.LiveRangeGraph()
1248 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001249 live_range.extract_live_ranges_from_schedule(
1250 self.sg,
Jonas Ohlssond8575072022-03-30 10:30:25 +02001251 mem_area,
1252 mem_type_set,
1253 lr_graph,
Tim Halld8339a72021-05-27 18:49:40 +01001254 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001255 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001256
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001257 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001258 if max(max_mem_usage) <= staging_limit:
1259 return
1260
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001261 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001262 base_mem_usage = np.array(max_mem_usage)
1263 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001264 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001266 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001267 curr_lrs.append(lr)
1268 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1269 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001271 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001272
1273 # Evict live ranges that will never fit
1274 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001275 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001276 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001277 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001278 self.evicted_fms.append(lr)
1279 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001280 curr_lrs.remove(lr)
1281
1282 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1283 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001284 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1285 # the memory limit cannot be exceeded if max_mem_usage does not exceed it.
1286 # Thus, the affected lrs can remain in fast-storage if the following is true
1287 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001288 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001289 else:
1290 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001291 for tens in lr.tensors:
1292 competing_tens_access[tens] = 0
1293
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001294 # If competing_lrs is empty, all lrs and their tensors have been handled and we may return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001295 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001296 return
1297
Johan Alfvén5c309712022-06-10 15:40:58 +02001298 # Estimate element access for all tensors that are competing for a place in fast-storage.
1299 # This number is used when deciding which tensors stay in fast-storage
1300 for sched_op in self.sched_ops:
1301 cost = schedule.cost_map[sched_op]
1302
1303 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1304 tens = sched_op.ifm.connection.parent_tens
1305 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1306 competing_tens_access[tens] += access.ifm_read[0]
1307
1308 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1309 tens = sched_op.ifm2.connection.parent_tens
1310 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1311 competing_tens_access[tens] += access.ifm_read[1]
1312
1313 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1314 tens = sched_op.ofm.connection.parent_tens
1315 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1316 competing_tens_access[tens] += access.ofm_write
1317
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001318 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001319 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001320
1321 # Remove lrs that have a live range that is too long compared to others.
1322 # They are causing problems for the HillClimb Allocator when it has to
1323 # change the allocation indices, in order to fit all the allocations into SRAM.
1324 # This problem only occurs in larger networks with complex graphs.
1325 #
1326 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1327 # at a time. Too many items result in excessively long compilation times.
1328 #
1329 # Too long is currently decided to be (based on experience, analyzing many networks):
1330 # Compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1331 # If the end times differ by more than MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
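        # For example (hypothetical end times, using the limits of 20 defined on
        # FastStorageComponentAllocator): if an lr spans at least 20 time steps, ends at time 100,
        # and the lr 20 positions later in the sorted list ends at time 60, then 100 > 60 + 20 and
        # the lr is removed from the fast storage candidates before the component split below.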
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001332 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1333 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001334 competing_lrs_copy = competing_lrs.copy()
1335 for i, lr in enumerate(competing_lrs_copy):
1336 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001337 # Only check live ranges longer than MAX_EXHAUSTIVE_LIFE_RANGE
1338 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1339 # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS
1340 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001341
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001342 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1343 if (
1344 lr.end_time
1345 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1346 ):
1347 # The current lr's live time stands out, remove it. There is no use adding it to the
1348 # evicted_fms list since the lr should not be included in the fast storage allocation
1349 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1350 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001351
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001352 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1353 # max size MAX_EXHAUSTIVE_ITEMS
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001354 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001355 end_time = competing_lrs[0].end_time
1356 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001357 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001358 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001359 nbr_items = i - start
1360 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 end_time = max(end_time, lr.end_time)
1362 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001363 # The number of items reached the maximum, or the current lr's start time
1364 # does not overlap with the current component's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001365 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001366 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001367 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001368 component_ranges.append((start, len(competing_lrs)))
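        # For example (hypothetical times): lrs with (start, end) times (0, 4), (2, 6) and (10, 12)
        # give component_ranges = [(0, 2), (2, 3)], since the third lr does not overlap the first
        # two; each slice of competing_lrs is then allocated independently below.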
1369
1370 # Allocate each component separately
1371 for start, end in component_ranges:
1372 component_allocator.allocate_component(
1373 competing_lrs[start:end],
1374 max_mem_usage,
1375 base_mem_usage,
1376 self.scratched_fms,
1377 competing_tens_access,
1378 self.evicted_fms,
1379 )
1380 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001381
1382 def move_constant_data(self):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001383 """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
1384 command in the high-level command stream"""
Tim Halld8339a72021-05-27 18:49:40 +01001385 for sched_op in self.sched_ops:
1386 parent_op = sched_op.parent_op
1387 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1388 max_ifm_shram_avail = (
1389 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1390 * self.arch.shram_bank_size
1391 // 2
1392 )
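            # max_ifm_shram_avail: roughly half of the SHRAM bytes left after the reserved output
            # banks (and the LUT, when used); used below as the size threshold above which a
            # broadcast elementwise input is considered for a DMA move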
1393
1394 for idx, tens in enumerate(parent_op.inputs):
1395 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1396 # Tensor is in permanent storage
1397 # Moving the data is only worthwhile when permanent storage differs from feature map storage
1398 if (
1399 tens.mem_area in self.arch.permanent_storage_mem_area
1400 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1401 ) or tens.purpose == TensorPurpose.LUT:
1402 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001403 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001404 tens.purpose == TensorPurpose.FeatureMap
1405 and sched_op.op_type.is_binary_elementwise_op()
1406 and tens.shape != []
1407 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001408 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001409 and tens.storage_size() > max_ifm_shram_avail
1410 ):
1411 only_vector_product_consumers = all(
1412 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1413 for oper in tens.consumers()
1414 )
1415
1416 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1417 new_tens = tens.clone_into_fast_storage(self.arch)
1418 if tens.purpose == TensorPurpose.LUT:
1419 new_tens.mem_area = MemArea.Shram
1420
1421 new_tens.consumer_list.append(parent_op)
1422 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001423 # If the index is out of range, IFM and IFM2 are the same tensor
1424 # and pass inputs don't have duplicates
1425 if idx < len(sched_op.parent_ps.inputs):
1426 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001427
1428 def print_schedule(self, schedule: Schedule):
1429 print(f"Schedule: '{schedule.name}'")
1430 for sched_op in self.sched_ops:
1431 if sched_op not in schedule.cost_map:
1432 # Sub-schedule printing
1433 continue
1434
1435 op_info = schedule.cost_map[sched_op]
1436 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1437 print(f"\t\tType: {sched_op.op_type}")
1438 print(f"\t\tKernel: {sched_op.kernel}")
1439 print(f"{op_info}")
1440 mem_usage = (
1441 schedule.memory_snapshot[op_info.time_index]
1442 if op_info.time_index < len(schedule.memory_snapshot)
1443 else 0
1444 )
1445 print(f"\t\tSRAM Used: {mem_usage} bytes")
1446
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001447 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001448 for i, cascade in enumerate(schedule.cascades.values()):
1449 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001450
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001451
Johan Alfven6e281af2023-02-28 09:03:03 +01001452def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
1453 mem_area = arch.fast_storage_mem_area
1454 mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
1455
1456 # Collect live ranges for the full graph
1457 # extract_live_ranges_from_cascaded_passes will start from the root sg and
1458 # all sub graphs/cascaded passes will be visited and the correct time_index
1459 # will be set for all the tensors.
1460 lr_graph = live_range.LiveRangeGraph()
1461 live_range.extract_live_ranges_from_cascaded_passes(
1462 nng.get_root_subgraph(),
1463 mem_area,
1464 mem_type_set,
1465 lr_graph,
1466 Tensor.AllocationQuantum,
1467 )
1468 # Populate time-array with memory used by live ranges
1469 temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
1470
1471 # Update snapshot for all the npu sub graphs
1472 # No longer needed by the scheduler, but npu_performance
1473 # uses this information, so it must have the correct state
1474 for sg in schedulers:
1475 sg.schedule.memory_snapshot = temporal_usage
1476 sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
1477
1478
Tim Halld8339a72021-05-27 18:49:40 +01001479def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1480 """
1481 Creates live ranges and runs tensor allocator for the current schedule
1482 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1483 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1484 """
1485 root_sg = nng.get_root_subgraph()
1486
1487 alloc_list = []
1488 if arch.is_spilling_enabled():
1489 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1490 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1491 # Order is important
1492 alloc_list.append(mem_alloc_scratch_fast)
1493 alloc_list.append(mem_alloc_scratch)
1494 else:
1495 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1496 alloc_list.append(mem_alloc_scratch)
1497
1498 for mem_area, mem_type_set in alloc_list:
1499 tensor_allocation.allocate_tensors(
1500 nng,
1501 root_sg,
1502 arch,
1503 mem_area,
1504 mem_type_set,
1505 tensor_allocator=options.tensor_allocator,
1506 verbose_allocation=options.verbose_allocation,
1507 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001508 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001509 )
1510
1511
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001512class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001513 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001514 MAX_EXHAUSTIVE_ITEMS = 20
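    # Both limits are heuristics: MAX_EXHAUSTIVE_ITEMS caps how many live ranges are searched
    # exhaustively per component (the search space grows as 2^n), and MAX_EXHAUSTIVE_LIFE_RANGE
    # is the margin, in time steps, used when pruning unusually long-lived ranges beforehand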
Johan Alfvén5c309712022-06-10 15:40:58 +02001515
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1517 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001518 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001519 self.staging_limit = staging_limit
1520 self.lrs = []
1521 self.evicted = []
1522 self.curr_evicted = []
1523 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001524 self.best_score = 0
1525 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001526
Johan Alfvén5c309712022-06-10 15:40:58 +02001527 def allocate_exhaustive(self, ix, score):
1528 # Favour tensors with highest element access (score)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001529 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001530 if score > self.best_score:
1531 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001532 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001533 return
1534
1535 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001537 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1538 # range, it can fit and the case where the tensor is included needs to be checked
1539 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001540 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001541 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001542 self.curr_evicted[ix] = False
1543 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001544 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001545 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1546
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001547 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1548 # case where the tensor is not included can be skipped
1549 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001550 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001551 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001552 self.curr_evicted[ix] = True
1553 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001554 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001555 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1556
1557 @staticmethod
1558 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001559 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001560 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001561 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001562 return mem_usage
1563
1564 @staticmethod
1565 def evict(lr, max_mem_usage, scratched_fms):
1566 for t in range(lr.start_time, lr.end_time + 1):
1567 max_mem_usage[t] -= lr.size
1568 for tens in lr.tensors:
1569 if tens in scratched_fms:
1570 tens.mem_area = scratched_fms[tens][0]
1571 tens.mem_type = scratched_fms[tens][1]
1572
1573 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001574 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001575 for t in range(lr.start_time, lr.end_time + 1):
1576 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001577
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001578 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1579 self.lrs = lrs
1580 self.num_lrs = len(lrs)
1581 self.evicted = [0] * self.num_lrs
1582 self.curr_evicted = [0] * self.num_lrs
1583 self.best_score = -1
1584 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001585 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1586 # For every permutation that fits within the staging_limit, a score is calculated.
1587 # The permutation with the highest score is then chosen. The score is calculated
1588 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1589 # included tensors, so it is not necessarily the tensor with the biggest size that ends up
1590 # being included in the result.
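        # For example (hypothetical sizes and accesses): with staging_limit = 100 and two
        # overlapping lrs of size 60 with element accesses 500 and 800, both cannot be kept,
        # so the permutation keeping only the 800-access lr scores highest and the other is evicted.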
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001591 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001592 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001593 for i, lr in enumerate(self.lrs):
1594 if self.evicted[i]:
1595 self.evict(lr, max_mem, scratched_fms)
1596 if lr not in evicted_fms:
1597 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001598 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001599 self.keep(lr, min_mem)
1600 if lr in evicted_fms:
1601 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001602
1603
Tim Halld8339a72021-05-27 18:49:40 +01001604def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1605 """Entry point for the Scheduler"""
1606 # Initialize CPU subgraphs
1607 schedulers = dict()
1608 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001609 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001610 if sg.placement != PassPlacement.Npu:
1611 # Create cascaded passes for CPU Ops
1612 cascaded_passes = []
1613 for idx, ps in enumerate(sg.passes):
1614 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001615 ps.name,
1616 SchedulingStrategy.WeightStream,
1617 ps.inputs,
1618 [],
1619 ps.outputs,
1620 [ps],
1621 ps.placement,
1622 False,
Tim Halld8339a72021-05-27 18:49:40 +01001623 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001624
Tim Halld8339a72021-05-27 18:49:40 +01001625 cps.time = idx
1626 ps.cascade = cps
1627 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001628
Tim Halld8339a72021-05-27 18:49:40 +01001629 sg.cascaded_passes = cascaded_passes
1630 else:
1631 # Npu subgraph - create schedule
1632 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1633 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001634
Tim Halld8339a72021-05-27 18:49:40 +01001635 scheduler.create_scheduler_representation(arch)
1636 sg.sched_ops = scheduler.sched_ops
1637 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001638
Tim Halld8339a72021-05-27 18:49:40 +01001639 # Create the Max schedule template
1640 max_schedule_template = scheduler.create_initial_schedule()
1641 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001642
Tim Halld8339a72021-05-27 18:49:40 +01001643 # Create the optimized Max schedule
1644 sg.schedule = max_schedule_template
1645 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001646 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001647 sg.schedule = opt_max_schedule
1648 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001649
Tim Halld8339a72021-05-27 18:49:40 +01001650 # Create Min schedule
1651 min_schedule = scheduler.propose_minimal_schedule()
1652 initial_sram_limit = scheduler_options.optimization_sram_limit
1653 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1654 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001655
Johan Alfvén255dad72022-07-16 18:27:05 +02001656 # Build cascades for Min schedule
1657 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001658 sg.schedule = min_schedule
1659 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001660
Tim Halld8339a72021-05-27 18:49:40 +01001661 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1662 # Create an optimized schedule
1663 sg.schedule = scheduler.optimize_schedule(
1664 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1665 )
1666 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001667
Tim Halld8339a72021-05-27 18:49:40 +01001668 scheduler.apply_schedule(sg.schedule)
1669 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001670
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001671 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1672 # It might be possible to gain performance by reducing
1673 # the weight buffer size and instead fitting fms in fast storage
1674 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1675
Tim Halld8339a72021-05-27 18:49:40 +01001676 if scheduler_options.verbose_schedule:
1677 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001678
Johan Alfven6e281af2023-02-28 09:03:03 +01001679 # Make a full live range calculation starting from the root sg
1680 _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
1681
Tim Halld8339a72021-05-27 18:49:40 +01001682 # Evaluate schedule
1683 _update_tensor_allocation(nng, arch, options)