# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Enable class name forward references in the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
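    """Returns the storage shape for the given tensor format; NHCWB16 rounds the depth up to a multiple of 16"""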
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
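    """Light-weight representation of a Tensor, holding only the attributes that the Scheduler needs"""
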
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # LUT must be placed in shram area. The copy is done by DMA
        # generated by the high level command stream generator.
        for idx, tens in enumerate(self.parent_op.inputs):
            if tens.purpose == TensorPurpose.LUT:
                new_tens = tens.clone_into_shram(self.arch)
                new_tens.consumer_list.append(self.parent_op)
                self.parent_op.inputs[idx] = new_tens

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        """For elementwise ops, when the IFM is in NHWC format and its depth is not brick-format
        aligned (a multiple of 16), it is better to force the OFM format to NHWC whenever the OFM
        can overwrite the IFM, in order to reduce memory transactions"""

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
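        """Updates the schedule's memory snapshot and fast storage peak usage based on the tensor live ranges"""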
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
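        """Builds a performance query for the op at the given OFM depth and returns the estimated cycle cost"""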
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
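        """Builds a performance query for the op at the given OFM depth and returns the estimated element accesses"""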
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are feature maps that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
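        """Returns True if the weights are in permanent storage and need to be DMA'd into fast storage"""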
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two slices
            # (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two buffers were used.
            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
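        """Creates a tensor of the given size in fast storage (Scratch_fast) to be used as a weight buffer"""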
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
870 """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
871 next operators stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
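        """Builds cascades for the Min schedule based on the residual (non-local) memory at each Op's time index"""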
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFMs that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
1004 """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
1005 proposing weight buffering and then continously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/OFM DRAM bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

    def optimize_weight_buffering_size(
        self,
        min_schedule: Schedule,
        options: SchedulerOptions,
    ):
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        # Restore mem/type for scratched_fms
        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.update_op_memory_snapshot(self.sg.schedule)

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_schedule(
                self.sg,
                mem_area,
                mem_type_set,
                lr_graph,
            )

        # Find the relation between the sched_op and the buffering tensor
        weight_ops = {}
        for sched_op in self.sched_ops:
            cost = self.sg.schedule.cost_map[sched_op]
            for tens in cost.buffered_weight_tensors:
                weight_ops[tens] = sched_op

        # Filter out weight buffer live ranges
        weight_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if weight_ops.get(tens):
                    weight_lrs.append(lr)
                    break

        # See if any evicted fm overlaps with a weight buffering op.
        # If this is the case add a size limitation to the buffering op
        for lr in self.evicted_fms:
            for weight_lr in weight_lrs:
                if lr.overlaps_ranges(weight_lr):
                    for tens in weight_lr.tensors:
                        sched_op = weight_ops.get(tens)
                        if sched_op:
                            # Add size reduction to the op
                            sched_op.evicted_fms_size += lr.size
                            break

        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(self.sg.schedule)

        # Run schedule buffering - with weight buffer size reduction
        schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
        schedule.cascades = self.sg.schedule.cascades
        self.sg.schedule = schedule

        # Apply the new buffer schedule and calculate new performance
1180 self.update_op_memory_snapshot(self.sg.schedule)
1181 self.apply_schedule(self.sg.schedule)
1182 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1183
Tim Hallc1be0872022-03-03 17:50:52 +00001184 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001185 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1186 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1187
Tim Hall8bc7a652022-05-19 15:29:23 +01001188 improvement_tot = (
1189 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1190 )
1191 improvement_dram = (
1192 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1193 )
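        # Worked example with hypothetical cycle counts: default_tot_cycles = 1000 and new_tot_cycles = 950
        # give improvement_tot = round((1000 - 950) / 1000, 2) = 0.05, i.e. a 5% reduction; a negative value
        # means the new schedule is slower than the default one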
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001194
1195 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001196 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001197 # No improvement, restore the default schedule
1198 for sched_op in self.sched_ops:
1199 sched_op.evicted_fms_size = 0
1200
1201 for tens in self.scratched_fms:
1202 tens.mem_area = self.scratched_fms[tens][0]
1203 tens.mem_type = self.scratched_fms[tens][1]
1204
1205 self.sg.schedule = default_schedule
1206 self.update_op_memory_snapshot(self.sg.schedule)
1207 self.apply_schedule(self.sg.schedule)
1208 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1209
Tim Halld8339a72021-05-27 18:49:40 +01001210 def apply_schedule(self, sched: Schedule):
1211 """Applies the given schedule as a final solution"""
1212 for sched_op in self.sched_ops:
1213 op_info = sched.cost_map[sched_op]
1214 cascade_info = sched.cascades.get(op_info.cascade, None)
1215 if cascade_info and sched_op in cascade_info.buffers:
1216 buffer_tens = sched_op.ifm.connection.parent_tens
1217 # Apply memory area and type
1218 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1219 buffer_tens.mem_type = MemType.Scratch_fast
1220 # Apply Rolling buffer
1221 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1222 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1223
1224 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1225
1226 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001227 for tens in op_info.buffered_weight_tensors:
1228 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001229
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001230 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001231 """Finds the set of feature maps that fits within the staging limit and has the largest combined number of
1232 access cycles, then moves those feature maps into fast storage"""
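        # The approach below:
        # 1. Tentatively force all eligible OFMs into fast storage and build their live ranges
        # 2. Evict live ranges that can never fit within the staging limit
        # 3. Keep live ranges that always fit and let the remaining ones compete
        # 4. Resolve the competing live ranges per component with an exhaustive search that favours the
        #    tensors with the highest estimated element access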
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001233 max_mem_usage = []
1234 base_mem_usage = []
1235 fast_storage_type = MemType.Scratch_fast
1236 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001237 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001238
1239 # Force all OFMs to fast-storage
1240 for sched_op in self.sched_ops:
1241 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001242 if cost.cascade == 0 and sched_op.get_dependants():
1243 ofm_tens = sched_op.ofm.connection.parent_tens
1244 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001245 if ofm_tens not in self.scratched_fms:
1246 # Remember default mem area and mem type, only done once
1247 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1248
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001249 ofm_tens.mem_area = fast_storage_mem_area
1250 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001251
1252 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001253 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001254 lr_graph = live_range.LiveRangeGraph()
1255 for mem_area, mem_type_set in memories_list:
Johan Alfven6e281af2023-02-28 09:03:03 +01001256 live_range.extract_live_ranges_from_schedule(
1257 self.sg,
Jonas Ohlssond8575072022-03-30 10:30:25 +02001258 mem_area,
1259 mem_type_set,
1260 lr_graph,
Tim Halld8339a72021-05-27 18:49:40 +01001261 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001262 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001263
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001264 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 if max(max_mem_usage) <= staging_limit:
1266 return
1267
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001268 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001269 base_mem_usage = np.array(max_mem_usage)
1270 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001271 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001272 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001273 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001274 curr_lrs.append(lr)
1275 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1276 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001277 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001278 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001279
1280 # Evict live ranges that will never fit
1281 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001282 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001283 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001284 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001285 self.evicted_fms.append(lr)
1286 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001287 curr_lrs.remove(lr)
1288
1289 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1290 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001291 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1292 # the memory limit cannot be exceeded if max_mem_usage does not.
1293 # Thus, the affected lrs can remain in fast-storage if the following is true
1294 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001295 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001296 else:
1297 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001298 for tens in lr.tensors:
1299 competing_tens_access[tens] = 0
1300
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001301 # All lrs and their tensors have been handled if competing_lrs is empty, we may thus return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001302 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001303 return
1304
Johan Alfvén5c309712022-06-10 15:40:58 +02001305 # Estimate element access for all tensors that are competing for a place in fast-storage.
1306 # This number is used when deciding which tensors stay in fast-storage
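        # Each competing tensor accumulates the ifm (and ifm2) reads of the ops consuming it and the ofm
        # writes of the op producing it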
1307 for sched_op in self.sched_ops:
1308 cost = schedule.cost_map[sched_op]
1309
1310 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1311 tens = sched_op.ifm.connection.parent_tens
1312 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1313 competing_tens_access[tens] += access.ifm_read[0]
1314
1315 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1316 tens = sched_op.ifm2.connection.parent_tens
1317 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1318 competing_tens_access[tens] += access.ifm_read[1]
1319
1320 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1321 tens = sched_op.ofm.connection.parent_tens
1322 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1323 competing_tens_access[tens] += access.ofm_write
1324
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001325 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001326 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001327
1328 # Remove lrs that have a live range that is too long compared to others.
1329 # They cause problems for the HillClimb Allocator when it has to
1330 # change the allocation indices, in order to fit all the allocations into SRAM.
1331 # This problem only occurs in larger networks with complex graphs.
1332 #
1333 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1334 # at a time. Too many items lead to excessively long compilation times.
1335 #
1336 # Too long is currently decided to be (based on experience, analyzing many networks):
1337 # Compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1338 # If the end time differs by more than MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
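        # For example, with MAX_EXHAUSTIVE_ITEMS = 20 and MAX_EXHAUSTIVE_LIFE_RANGE = 20: an lr spanning at
        # least 20 time steps whose end time is more than 20 time steps beyond the end time of the lr 20
        # positions later in the sorted list (or the last lr) is evicted here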
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001339 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1340 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001341 competing_lrs_copy = competing_lrs.copy()
1342 for i, lr in enumerate(competing_lrs_copy):
1343 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001344 # Only check live ranges longer than MAX_EXHAUSTIVE_LIFE_RANGE
1345 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1346 # Compare the current lr with the lr at position i + MAX_EXHAUSTIVE_ITEMS
1347 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001348
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001349 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1350 if (
1351 lr.end_time
1352 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1353 ):
1354 # Current lr live time stands out, remove it. No use adding it to the
1355 # evicted_fms list since the lr should not be included in the fast storage allocation
1356 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1357 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001358
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001359 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1360 # max size MAX_EXHAUSTIVE_ITEMS
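        # For example, lrs with time ranges [0, 3], [2, 5] and [8, 9] form two components: the first two
        # overlap each other, while the third starts after the first component has ended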
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001362 end_time = competing_lrs[0].end_time
1363 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001364 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001365 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001366 nbr_items = i - start
1367 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001368 end_time = max(end_time, lr.end_time)
1369 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001370 # The number of items reached the maximum, or the current lr's start time
1371 # does not overlap with the previous lr's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001372 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001373 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001374 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001375 component_ranges.append((start, len(competing_lrs)))
1376
1377 # Allocate each component separately
1378 for start, end in component_ranges:
1379 component_allocator.allocate_component(
1380 competing_lrs[start:end],
1381 max_mem_usage,
1382 base_mem_usage,
1383 self.scratched_fms,
1384 competing_tens_access,
1385 self.evicted_fms,
1386 )
1387 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001388
Tim Halld8339a72021-05-27 18:49:40 +01001389 def print_schedule(self, schedule: Schedule):
1390 print(f"Schedule: '{schedule.name}'")
1391 for sched_op in self.sched_ops:
1392 if sched_op not in schedule.cost_map:
1393 # Sub-schedule printing
1394 continue
1395
1396 op_info = schedule.cost_map[sched_op]
1397 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1398 print(f"\t\tType: {sched_op.op_type}")
1399 print(f"\t\tKernel: {sched_op.kernel}")
1400 print(f"{op_info}")
1401 mem_usage = (
1402 schedule.memory_snapshot[op_info.time_index]
1403 if op_info.time_index < len(schedule.memory_snapshot)
1404 else 0
1405 )
1406 print(f"\t\tSRAM Used: {mem_usage} bytes")
1407
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001408 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001409 for i, cascade in enumerate(schedule.cascades.values()):
1410 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001411
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001412
Johan Alfven6e281af2023-02-28 09:03:03 +01001413def _update_memory_snapshot_for_all_npu_graphs(nng: Graph, arch: ArchitectureFeatures, schedulers):
1414 mem_area = arch.fast_storage_mem_area
1415 mem_type_set = set((MemType.Scratch, MemType.Scratch_fast))
1416
1417 # Collect live ranges for the full graph
1418 # extract_live_ranges_from_cascaded_passes will start from the root sg and
1419 # all sub graphs/cascaded passes will be visited and the correct time_index
1420 # will be set for all the tensors.
1421 lr_graph = live_range.LiveRangeGraph()
1422 live_range.extract_live_ranges_from_cascaded_passes(
1423 nng.get_root_subgraph(),
1424 mem_area,
1425 mem_type_set,
1426 lr_graph,
1427 Tensor.AllocationQuantum,
1428 )
1429 # Populate time-array with memory used by live ranges
1430 temporal_usage = lr_graph.get_temporal_memory_usage(arch.fast_storage_mem_area)
1431
1432 # Update snapshot for all the npu sub graphs
1433 # Not needed for the scheduler any longer but npu_performance
1434 # is using this information so it must have the correct state
1435 for sg in schedulers:
1436 sg.schedule.memory_snapshot = temporal_usage
1437 sg.schedule.fast_storage_peak_usage = max(temporal_usage, default=0)
1438
1439
Tim Halld8339a72021-05-27 18:49:40 +01001440def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1441 """
1442 Creates live ranges and runs tensor allocator for the current schedule
1443 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1444 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1445 """
1446 root_sg = nng.get_root_subgraph()
1447
1448 alloc_list = []
1449 if arch.is_spilling_enabled():
1450 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1451 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1452 # Order is important
1453 alloc_list.append(mem_alloc_scratch_fast)
1454 alloc_list.append(mem_alloc_scratch)
1455 else:
1456 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1457 alloc_list.append(mem_alloc_scratch)
1458
1459 for mem_area, mem_type_set in alloc_list:
1460 tensor_allocation.allocate_tensors(
1461 nng,
1462 root_sg,
1463 arch,
1464 mem_area,
1465 mem_type_set,
1466 tensor_allocator=options.tensor_allocator,
1467 verbose_allocation=options.verbose_allocation,
1468 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001469 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001470 )
1471
1472
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001473class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001474 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001475 MAX_EXHAUSTIVE_ITEMS = 20
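    # With at most MAX_EXHAUSTIVE_ITEMS live ranges per component, allocate_exhaustive evaluates at most
    # 2**20 (roughly one million) include/evict combinations per component, fewer in practice due to pruning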
Johan Alfvén5c309712022-06-10 15:40:58 +02001476
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001477 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1478 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001479 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001480 self.staging_limit = staging_limit
1481 self.lrs = []
1482 self.evicted = []
1483 self.curr_evicted = []
1484 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001485 self.best_score = 0
1486 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001487
Johan Alfvén5c309712022-06-10 15:40:58 +02001488 def allocate_exhaustive(self, ix, score):
1489 # Favour tensors with highest element access (score)
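        # Depth-first search over include/evict decisions for self.lrs[ix:]. base_mem_usage reflects only the
        # lrs kept so far, max_mem_usage reflects all remaining lrs still being kept; the include branch is
        # skipped when the lr cannot fit on top of base_mem_usage, and the evict branch is skipped when
        # max_mem_usage already fits within the staging limit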
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001490 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001491 if score > self.best_score:
1492 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001493 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001494 return
1495
1496 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001497
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001498 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1499 # range, it can fit and the case where the tensor is included needs to be checked
1500 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001501 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001502 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001503 self.curr_evicted[ix] = False
1504 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001505 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001506 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1507
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001508 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1509 # case where the tensor is not included can be skipped
1510 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001511 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001512 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001513 self.curr_evicted[ix] = True
1514 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001515 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1517
1518 @staticmethod
1519 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001520 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001521 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001522 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001523 return mem_usage
1524
1525 @staticmethod
1526 def evict(lr, max_mem_usage, scratched_fms):
1527 for t in range(lr.start_time, lr.end_time + 1):
1528 max_mem_usage[t] -= lr.size
1529 for tens in lr.tensors:
1530 if tens in scratched_fms:
1531 tens.mem_area = scratched_fms[tens][0]
1532 tens.mem_type = scratched_fms[tens][1]
1533
1534 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001535 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 for t in range(lr.start_time, lr.end_time + 1):
1537 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001538
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001539 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1540 self.lrs = lrs
1541 self.num_lrs = len(lrs)
1542 self.evicted = [0] * self.num_lrs
1543 self.curr_evicted = [0] * self.num_lrs
1544 self.best_score = -1
1545 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001546 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1547 # For every permutation that fits within the staging_limit, a score is calculated.
1548 # The permutation with the highest score is then chosen. The score is calculated
1549 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1550 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1551 # being included in the result.
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001552 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001553 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001554 for i, lr in enumerate(self.lrs):
1555 if self.evicted[i]:
1556 self.evict(lr, max_mem, scratched_fms)
1557 if lr not in evicted_fms:
1558 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001559 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001560 self.keep(lr, min_mem)
1561 if lr in evicted_fms:
1562 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001563
1564
Tim Halld8339a72021-05-27 18:49:40 +01001565def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1566 """Entry point for the Scheduler"""
1567 # Initialize CPU subgraphs
1568 schedulers = dict()
1569 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001570 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001571 if sg.placement != PassPlacement.Npu:
1572 # Create cascaded passes for CPU Ops
1573 cascaded_passes = []
1574 for idx, ps in enumerate(sg.passes):
1575 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001576 ps.name,
1577 SchedulingStrategy.WeightStream,
1578 ps.inputs,
1579 [],
1580 ps.outputs,
1581 [ps],
1582 ps.placement,
1583 False,
Tim Halld8339a72021-05-27 18:49:40 +01001584 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001585
Tim Halld8339a72021-05-27 18:49:40 +01001586 cps.time = idx
1587 ps.cascade = cps
1588 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001589
Tim Halld8339a72021-05-27 18:49:40 +01001590 sg.cascaded_passes = cascaded_passes
1591 else:
1592 # Npu subgraph - create schedule
1593 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1594 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001595
Tim Halld8339a72021-05-27 18:49:40 +01001596 scheduler.create_scheduler_representation(arch)
1597 sg.sched_ops = scheduler.sched_ops
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001598
Tim Halld8339a72021-05-27 18:49:40 +01001599 # Create the Max schedule template
1600 max_schedule_template = scheduler.create_initial_schedule()
1601 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001602
Tim Halld8339a72021-05-27 18:49:40 +01001603 # Create the optimised Max schedule
1604 sg.schedule = max_schedule_template
1605 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001606 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001607 sg.schedule = opt_max_schedule
1608 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001609
Tim Halld8339a72021-05-27 18:49:40 +01001610 # Create Min schedule
1611 min_schedule = scheduler.propose_minimal_schedule()
1612 initial_sram_limit = scheduler_options.optimization_sram_limit
1613 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1614 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001615
Johan Alfvén255dad72022-07-16 18:27:05 +02001616 # Build cascades for Min schedule
1617 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001618 sg.schedule = min_schedule
1619 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001620
Tim Halld8339a72021-05-27 18:49:40 +01001621 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1622 # Create an optimized schedule
1623 sg.schedule = scheduler.optimize_schedule(
1624 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1625 )
1626 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001627
Tim Halld8339a72021-05-27 18:49:40 +01001628 scheduler.apply_schedule(sg.schedule)
1629 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001630
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001631 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1632 # It might be possible to gain performance by reducing
1633 # the weight buffer size and instead fitting fms in fast storage
1634 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1635
Tim Halld8339a72021-05-27 18:49:40 +01001636 if scheduler_options.verbose_schedule:
1637 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001638
Johan Alfven6e281af2023-02-28 09:03:03 +01001639 # Make a full live range calculation starting from the root sg
1640 _update_memory_snapshot_for_all_npu_graphs(nng, arch, schedulers)
1641
Tim Halld8339a72021-05-27 18:49:40 +01001642 # Evaluate schedule
1643 _update_tensor_allocation(nng, arch, options)