# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    """Returns the storage shape for the given tensor format (NHCWB16 rounds the depth up to a multiple of 16)"""
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

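    # Illustrative sizing example (not part of the original source): assuming 1-byte elements,
    # an int8 feature map of shape (1, 16, 16, 20) stored as NHCWB16 has its depth rounded up
    # to 32, giving 1 * 16 * 16 * 32 = 8192 bytes, which is then rounded up to the nearest
    # Tensor.AllocationQuantum; stored as NHWC the same tensor needs 1 * 16 * 16 * 20 = 5120
    # bytes before that rounding.
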
292 def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
293 """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
294 ifm_shape = self.ifm.shape
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100295 ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
Tim Halld8339a72021-05-27 18:49:40 +0100296 ofm_shape = stripe
297
298 if ofm_shape != self.ofm.shape:
299 # Striped Op - Need to calculate stripe input volume
300 stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
301 # Ensure stripe input volume is within the full IFM volume
302 stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
303 stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
304 ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)
305
306 if self.ifm2:
307 stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
308 stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
309 ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)
310
311 block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)
312
313 scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
314 if self.parent_op.weights:
315 # Default full-depth weight encoding with no buffering
Tim Halld784af72021-06-08 21:25:57 +0100316 (
317 scheduler_op_info.npu_weights_tensor,
318 scheduler_op_info.npu_scales_tensor,
319 ) = weight_compressor.encode_weight_and_scale_tensor(
Tim Halld8339a72021-05-27 18:49:40 +0100320 self.arch,
321 self.parent_op,
322 self.parent_op.weights,
323 self.parent_op.bias,
324 self.kernel,
325 block_config,
326 [0, self.ofm.shape.depth],
327 )
328
329 self.parent_ps.block_config = block_config.old_style_representation()
330 return scheduler_op_info
331
    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape 'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

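        # Assumed rationale (not stated in the original code): NHCWB16 pads the depth to a
        # multiple of 16, so an OFM whose depth is already 16-aligned loses nothing by using
        # the brick format; only the remaining cases are checked for an overwritable NHWC input.
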
        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

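        # At this point cost.slack_buffering_cycles holds this operator's own cycle count and
        # cost.slack_buffering_memory the SRAM headroom underneath it. propose_weight_buffering
        # later reads the previous operator's slack to decide how much of the current operator's
        # weights can be prefetched while the previous operator is still running.
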
        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable apply size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are feature maps that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Moving the data is only worthwhile when permanent storage differs from feature map storage
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles and how much unused SRAM are available under the previously
        # executing operator for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double-buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

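                # Illustrative numbers (not from the original source): with 100 kB of encoded
                # weights, 8000 slack cycles and 20000 transfer cycles, prebuffer_ratio is 0.4,
                # so roughly 40 kB (capped at half_buffer_limit) is prefetched up front and the
                # remainder is streamed in buffering_depth-sized slices while the op executes.
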
                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)
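                    # For example (illustrative only): prebuffer_depth 32, stripe depth 128 and
                    # buffering_depth 48 give depth_slices == [0, 32, 80, 128], i.e. one
                    # prebuffered slice followed by two streamed slices.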

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)

                # Note! OFM depth slices define slices as [0, s1, ... sn]. For example, [0, 70, 140] describes two
                # slices (0-70 and 70-140) but has a length of 3, which would result in idx = 3 % 2 = 1 if two
                # buffers were used.
                last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
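            # The schedule is traversed output-to-input, so prev_op is the operator that consumes
            # this operator's OFM; its vertical stride dictates the smallest stripe height that is
            # still useful to produce here.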
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)
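            # Illustrative example (not from the original source): with 2x upscaling to_upscale()
            # returns 2, so a proposed stripe height of 5 is padded to 6 above before being scaled
            # by the kernel stride, keeping upscaled stripe heights even.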

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for the Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has the IFM and OFM in SRAM (no buffered weight tensors)
                # Only include IFMs that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0
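            # non_local_mem_usage is the part of the SRAM snapshot at this time index that is not
            # this operator's own IFM/OFM, i.e. memory the cascade builder must treat as already
            # occupied when it sizes the cascades below.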

        # Create cascades for the Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
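        # Illustrative example (not from the original source): an OFM that is 32 rows high whose
        # MIN-schedule stripe was 4 rows is re-evaluated with stripe heights 5..16; stripes taller
        # than half the OFM height are not considered.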
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
            nbr_of_cascades = len(proposed_schedule.cascades)
            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/OFM DRAM bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if proposed_schedule_mem_usage <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades, optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

Johan Alfvén6f4cb032022-05-05 08:42:46 +02001110 def optimize_weight_buffering_size(
1111 self,
1112 min_schedule: Schedule,
1113 options: SchedulerOptions,
1114 ):
1115 default_schedule = self.sg.schedule
Tim Hallc1be0872022-03-03 17:50:52 +00001116 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001117 default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1118 default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1119
1120 # Restore mem/type for scratched_fms
1121 for tens in self.scratched_fms:
1122 tens.mem_area = self.scratched_fms[tens][0]
1123 tens.mem_type = self.scratched_fms[tens][1]
1124
1125 self.update_op_memory_snapshot(self.sg.schedule)
1126
1127 # Collect live ranges from tensors
1128 memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
1129 lr_graph = live_range.LiveRangeGraph()
1130 for mem_area, mem_type_set in memories_list:
1131 live_range.extract_live_ranges_from_cascaded_passes(
1132 self.nng.get_root_subgraph(),
1133 mem_area,
1134 mem_type_set,
1135 lr_graph,
1136 Tensor.AllocationQuantum,
1137 )
1138
1139 # Find the relation between the sched_op and the buffering tensor
1140 weight_ops = {}
1141 for sched_op in self.sched_ops:
1142 cost = self.sg.schedule.cost_map[sched_op]
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001143 for tens in cost.buffered_weight_tensors:
1144 weight_ops[tens] = sched_op
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001145
1146 # Filter out weight buffer live ranges
1147 weight_lrs = []
1148 for lr in lr_graph.lrs:
1149 for tens in lr.tensors:
1150 if weight_ops.get(tens):
1151 weight_lrs.append(lr)
1152 break
1153
1154 # See if any evicted fm overlaps with a weight buffering op.
1155 # If this is the case add a size limitation to the buffering op
1156 for lr in self.evicted_fms:
1157 for weight_lr in weight_lrs:
1158 if lr.overlaps_ranges(weight_lr):
1159 for tens in weight_lr.tensors:
1160 sched_op = weight_ops.get(tens)
1161 if sched_op:
1162 # Add size reduction to the op
1163 sched_op.evicted_fms_size += lr.size
1164 break
1165
1166 self.sg.schedule = min_schedule
1167 self.update_op_memory_snapshot(self.sg.schedule)
1168
1169 # Run schedule buffering - with weight buffer size reduction
1170 schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
1171 schedule.cascades = self.sg.schedule.cascades
1172 self.sg.schedule = schedule
1173
1174 # Apply new buffer schdule and calc new performance
1175 self.update_op_memory_snapshot(self.sg.schedule)
1176 self.apply_schedule(self.sg.schedule)
1177 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1178
Tim Hallc1be0872022-03-03 17:50:52 +00001179 npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001180 new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
1181 new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]
1182
Tim Hall8bc7a652022-05-19 15:29:23 +01001183 improvement_tot = (
1184 round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
1185 )
1186 improvement_dram = (
1187 round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
1188 )
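# Hypothetical example: default_tot_cycles = 100000 and new_tot_cycles = 95000 gives
# improvement_tot = round((100000 - 95000) / 100000, 2) = 0.05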
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001189
1190 # Compare both total and dram improvement
Johan Alfvén3dae1b62022-05-17 10:26:48 +02001191 if not (improvement_tot >= 0 and improvement_dram > 0):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001192 # No improvement, restore the default schedule
1193 for sched_op in self.sched_ops:
1194 sched_op.evicted_fms_size = 0
1195
1196 for tens in self.scratched_fms:
1197 tens.mem_area = self.scratched_fms[tens][0]
1198 tens.mem_type = self.scratched_fms[tens][1]
1199
1200 self.sg.schedule = default_schedule
1201 self.update_op_memory_snapshot(self.sg.schedule)
1202 self.apply_schedule(self.sg.schedule)
1203 self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)
1204
Tim Halld8339a72021-05-27 18:49:40 +01001205 def apply_schedule(self, sched: Schedule):
1206 """Applies the given schedule as a final solution"""
1207 for sched_op in self.sched_ops:
1208 op_info = sched.cost_map[sched_op]
1209 cascade_info = sched.cascades.get(op_info.cascade, None)
1210 if cascade_info and sched_op in cascade_info.buffers:
1211 buffer_tens = sched_op.ifm.connection.parent_tens
1212 # Apply memory area and type
1213 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1214 buffer_tens.mem_type = MemType.Scratch_fast
1215 # Apply Rolling buffer
1216 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1217 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1218
1219 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1220
1221 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001222 for tens in op_info.buffered_weight_tensors:
1223 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001224
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001225 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001226 """Finds the set of feature maps that fits within the staging limit and, combined, has the largest number of
1227 access cycles, and moves those feature maps into fast storage"""
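# Outline of the steps below: force eligible OFMs to fast storage, extract live ranges, evict feature
# maps that can never fit, keep those that always fit, score the remaining (competing) tensors by their
# estimated element access, split them into components of overlapping live ranges and let
# FastStorageComponentAllocator pick the best-scoring subset of each component.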
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001228 max_mem_usage = []
1229 base_mem_usage = []
1230 fast_storage_type = MemType.Scratch_fast
1231 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001232 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001233
1234 # Force all OFMs to fast-storage
1235 for sched_op in self.sched_ops:
1236 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001237 if cost.cascade == 0 and sched_op.get_dependants():
1238 ofm_tens = sched_op.ofm.connection.parent_tens
1239 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001240 if ofm_tens not in self.scratched_fms:
1241 # Remember default mem area and mem type, only done once
1242 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1243
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001244 ofm_tens.mem_area = fast_storage_mem_area
1245 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001246
1247 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001248 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001249 lr_graph = live_range.LiveRangeGraph()
1250 for mem_area, mem_type_set in memories_list:
1251 live_range.extract_live_ranges_from_cascaded_passes(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001252 self.nng.get_root_subgraph(),
1253 mem_area,
1254 mem_type_set,
1255 lr_graph,
1256 Tensor.AllocationQuantum,
Tim Halld8339a72021-05-27 18:49:40 +01001257 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001258 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001259
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001260 # If max_mem_usage does not exceed the staging limit at any point, all lrs fit and can stay in fast storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001261 if max(max_mem_usage) <= staging_limit:
1262 return
1263
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001264 # Build up the base memory usage by removing the mem_usage of the lrs we previously moved to fast-storage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001265 base_mem_usage = np.array(max_mem_usage)
1266 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001267 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001268 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001269 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 curr_lrs.append(lr)
1271 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1272 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001273 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001274 competing_tens_access = {}
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001275
1276 # Evict live ranges that will never fit
1277 for lr in curr_lrs.copy():
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001278 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001279 if base_usage + lr.size > staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001280 # Lr will never fit and may thus be evicted
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001281 self.evicted_fms.append(lr)
1282 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001283 curr_lrs.remove(lr)
1284
1285 # Keep live ranges that will always fit in fast storage and let the remaining ones compete
1286 for lr in curr_lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001287 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1288 # the memory limit cannot be exceeded if max_mem_usage does not.
1289 # Thus, the affected lrs can remain in fast-storage if the following is true
1290 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001291 FastStorageComponentAllocator.keep(lr, base_mem_usage)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001292 else:
1293 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001294 for tens in lr.tensors:
1295 competing_tens_access[tens] = 0
1296
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001297 # All lrs and their tensors have been handled if competing_lrs is empty, we may thus return
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001298 if len(competing_lrs) == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001299 return
1300
Johan Alfvén5c309712022-06-10 15:40:58 +02001301 # Estimate element access for all tensors that are competing for a place in fast-storage.
1302 # This number is used when deciding which tensors stay in fast-storage
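# A tensor that is read or written by several ops accumulates the element access of each of them below,
# so a small but heavily accessed feature map can outscore a larger, rarely accessed one.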
1303 for sched_op in self.sched_ops:
1304 cost = schedule.cost_map[sched_op]
1305
1306 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1307 tens = sched_op.ifm.connection.parent_tens
1308 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1309 competing_tens_access[tens] += access.ifm_read[0]
1310
1311 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1312 tens = sched_op.ifm2.connection.parent_tens
1313 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1314 competing_tens_access[tens] += access.ifm_read[1]
1315
1316 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1317 tens = sched_op.ofm.connection.parent_tens
1318 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1319 competing_tens_access[tens] += access.ofm_write
1320
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001321 # Sort live ranges "from left to right" on the time axis to simplify checking overlapping ranges
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001322 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001323
1324 # Remove lrs that have a live range that is too long compared to others.
1325 # They are causing problems for the HillClimb Allocator when it has to
1326 # change the allocation indices, in order to fit all the allocations into SRAM.
1327 # This problem only occurs in larger networks with complex graphs.
1328 #
1329 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1330 # at a time. Too many would give too long a compilation time.
1331 #
1332 # Too long is currently decided to be (based on experience, analyzing many networks):
1333 # Compare lr at position i with lr at position i + MAX_EXHAUSTIVE_ITEMS.
1334 # If end time differs by at least MAX_EXHAUSTIVE_LIFE_RANGE then do not include lr at position i.
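# With the current constants (MAX_EXHAUSTIVE_ITEMS = 20, MAX_EXHAUSTIVE_LIFE_RANGE = 20) this means:
# a live range spanning at least 20 time steps is dropped from the competition if it ends more than
# 20 time steps after the live range 20 positions further ahead in the sorted list.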
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001335 if len(competing_lrs) > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1336 # Create a copy of the original list to iterate over because the original version is modified in-loop
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001337 competing_lrs_copy = competing_lrs.copy()
1338 for i, lr in enumerate(competing_lrs_copy):
1339 lr_time = lr.end_time - lr.start_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001340 # Only check live ranges that are at least MAX_EXHAUSTIVE_LIFE_RANGE long
1341 if lr_time >= FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1342 # Compare current lr with lr at position i + MAX_EXHAUSTIVE_ITEMS
1343 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, len(competing_lrs) - 1)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001344
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001345 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1346 if (
1347 lr.end_time
1348 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1349 ):
1350 # Current lr live time stands out, remove it. No use adding it to the
1351 # evicted_fms list since the lr should not be included in the fast storage allocation
1352 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1353 competing_lrs.remove(lr)
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001354
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001355 # Split competing live ranges into components by finding disconnected groups of live ranges or components of
1356 # max size MAX_EXHAUSTIVE_ITEMS
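# Hypothetical example: live ranges covering times [0, 3], [2, 5] and [8, 9] form two components,
# ([0, 3], [2, 5]) and ([8, 9],), since the third range does not overlap the first two
# (assuming fewer than MAX_EXHAUSTIVE_ITEMS live ranges per component).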
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001357 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001358 end_time = competing_lrs[0].end_time
1359 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001360 component_ranges = []
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001361 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001362 nbr_items = i - start
1363 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001364 end_time = max(end_time, lr.end_time)
1365 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001366 # Number of items reached the maximum, or the current lr's start time
1367 # does not overlap with the previous lr's end time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001368 component_ranges.append((start, i))
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001369 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001370 end_time = lr.end_time
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001371 component_ranges.append((start, len(competing_lrs)))
1372
1373 # Allocate each component separately
1374 for start, end in component_ranges:
1375 component_allocator.allocate_component(
1376 competing_lrs[start:end],
1377 max_mem_usage,
1378 base_mem_usage,
1379 self.scratched_fms,
1380 competing_tens_access,
1381 self.evicted_fms,
1382 )
1383 assert max(max_mem_usage) <= staging_limit, "Allocation exceeds staging limit"
Tim Halld8339a72021-05-27 18:49:40 +01001384
1385 def move_constant_data(self):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001386 """Determine if data can be moved from permanent storage to another memory area. A move will generate a DMA
1387 command in the high-level command stream"""
Tim Halld8339a72021-05-27 18:49:40 +01001388 for sched_op in self.sched_ops:
1389 parent_op = sched_op.parent_op
1390 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1391 max_ifm_shram_avail = (
1392 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1393 * self.arch.shram_bank_size
1394 // 2
1395 )
1396
1397 for idx, tens in enumerate(parent_op.inputs):
1398 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1399 # Tensor is in permanent storage
1400 # Only when permanent storage differs from feature map storage is there a point in moving the data
1401 if (
1402 tens.mem_area in self.arch.permanent_storage_mem_area
1403 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1404 ) or tens.purpose == TensorPurpose.LUT:
1405 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001406 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001407 tens.purpose == TensorPurpose.FeatureMap
1408 and sched_op.op_type.is_binary_elementwise_op()
1409 and tens.shape != []
1410 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001411 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001412 and tens.storage_size() > max_ifm_shram_avail
1413 ):
1414 only_vector_product_consumers = all(
1415 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1416 for oper in tens.consumers()
1417 )
1418
1419 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1420 new_tens = tens.clone_into_fast_storage(self.arch)
1421 if tens.purpose == TensorPurpose.LUT:
1422 new_tens.mem_area = MemArea.Shram
1423
1424 new_tens.consumer_list.append(parent_op)
1425 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001426 # If the index is out of range, IFM and IFM2 are the same tensor
1427 # and pass inputs don't have duplicates
1428 if idx < len(sched_op.parent_ps.inputs):
1429 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001430
1431 def print_schedule(self, schedule: Schedule):
1432 print(f"Schedule: '{schedule.name}'")
1433 for sched_op in self.sched_ops:
1434 if sched_op not in schedule.cost_map:
1435 # Sub-schedule printing
1436 continue
1437
1438 op_info = schedule.cost_map[sched_op]
1439 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1440 print(f"\t\tType: {sched_op.op_type}")
1441 print(f"\t\tKernel: {sched_op.kernel}")
1442 print(f"{op_info}")
1443 mem_usage = (
1444 schedule.memory_snapshot[op_info.time_index]
1445 if op_info.time_index < len(schedule.memory_snapshot)
1446 else 0
1447 )
1448 print(f"\t\tSRAM Used: {mem_usage} bytes")
1449
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001450 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001451 for i, cascade in enumerate(schedule.cascades.values()):
1452 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001453
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001454
Tim Halld8339a72021-05-27 18:49:40 +01001455def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1456 """
1457 Creates live ranges and runs tensor allocator for the current schedule
1458 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1459 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1460 """
1461 root_sg = nng.get_root_subgraph()
1462
1463 alloc_list = []
1464 if arch.is_spilling_enabled():
1465 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1466 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1467 # Order is important
1468 alloc_list.append(mem_alloc_scratch_fast)
1469 alloc_list.append(mem_alloc_scratch)
1470 else:
1471 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1472 alloc_list.append(mem_alloc_scratch)
1473
1474 for mem_area, mem_type_set in alloc_list:
1475 tensor_allocation.allocate_tensors(
1476 nng,
1477 root_sg,
1478 arch,
1479 mem_area,
1480 mem_type_set,
1481 tensor_allocator=options.tensor_allocator,
1482 verbose_allocation=options.verbose_allocation,
1483 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001484 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001485 )
1486
1487
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001488class FastStorageComponentAllocator:
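"""Exhaustively searches for the subset of competing feature map live ranges that fits within the
staging limit and has the highest combined element access, keeping that subset in fast storage and
evicting the rest. Live ranges are handled one component (group of overlapping live ranges) at a time."""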
Johan Alfvén5c309712022-06-10 15:40:58 +02001489 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001490 MAX_EXHAUSTIVE_ITEMS = 20
Johan Alfvén5c309712022-06-10 15:40:58 +02001491
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001492 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1493 self.base_mem_usage = base_mem_usage
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001494 self.max_mem_usage = max_mem_usage
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001495 self.staging_limit = staging_limit
1496 self.lrs = []
1497 self.evicted = []
1498 self.curr_evicted = []
1499 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001500 self.best_score = 0
1501 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001502
Johan Alfvén5c309712022-06-10 15:40:58 +02001503 def allocate_exhaustive(self, ix, score):
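# ix is the index of the live range currently being decided on; score is the accumulated element
# access of the live ranges kept in fast storage so far along this branch of the search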
1504 # Favour tensors with highest element access (score)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001505 if ix >= self.num_lrs:
Johan Alfvén5c309712022-06-10 15:40:58 +02001506 if score > self.best_score:
1507 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001508 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001509 return
1510
1511 lr = self.lrs[ix]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001512
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001513 # If adding the tensor size to the base mem usage doesn't exceed the staging limit anywhere on the lr time
1514 # range, it can fit and the case where the tensor is included needs to be checked
1515 can_fit = max(self.base_mem_usage[lr.start_time : lr.end_time + 1]) + lr.size <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001516 if can_fit:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001517 # Tensor can fit, add tensor element access to the score and check case where tensor is included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001518 self.curr_evicted[ix] = False
1519 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001520 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[lr.tensors[0]])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001521 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1522
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001523 # If the max mem usage doesn't exceed the staging limit anywhere on the lr time range, it always fits and the
1524 # case where the tensor is not included can be skipped
1525 always_fits = max(self.max_mem_usage[lr.start_time : lr.end_time + 1]) <= self.staging_limit
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001526 if not always_fits:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001527 # Tensor doesn't always fit, check case when tensor is not included
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001528 self.curr_evicted[ix] = True
1529 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001530 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001531 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1532
1533 @staticmethod
1534 def update_mem_usage(mem_usage, lr, increase):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001535 size = lr.size if increase else -lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 for t in range(lr.start_time, lr.end_time + 1):
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001537 mem_usage[t] += size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001538 return mem_usage
1539
1540 @staticmethod
1541 def evict(lr, max_mem_usage, scratched_fms):
1542 for t in range(lr.start_time, lr.end_time + 1):
1543 max_mem_usage[t] -= lr.size
1544 for tens in lr.tensors:
1545 if tens in scratched_fms:
1546 tens.mem_area = scratched_fms[tens][0]
1547 tens.mem_type = scratched_fms[tens][1]
1548
1549 @staticmethod
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001550 def keep(lr, base_mem_usage):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001551 for t in range(lr.start_time, lr.end_time + 1):
1552 base_mem_usage[t] += lr.size
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001553
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001554 def allocate_component(self, lrs, max_mem, min_mem, scratched_fms, competing_tens_access, evicted_fms):
1555 self.lrs = lrs
1556 self.num_lrs = len(lrs)
1557 self.evicted = [0] * self.num_lrs
1558 self.curr_evicted = [0] * self.num_lrs
1559 self.best_score = -1
1560 self.competing_tens_access = competing_tens_access
Johan Alfvén5c309712022-06-10 15:40:58 +02001561 # Recursively evaluate all combinations of keeping or evicting the lrs found in the component.
1562 # For every combination that fits within the staging_limit a score is calculated.
1563 # The combination with the highest score is then chosen. The score is calculated
1564 # as the sum of the actual element accesses (ifm reads and ofm writes) for all the
1565 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1566 # being included in the result.
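# Hypothetical example: if two overlapping live ranges of 60 kB and 40 kB cannot both fit, and their
# tensors have element accesses of 1000 and 4000 respectively, the 40 kB live range is the one kept
# in fast storage since its access count gives the higher score.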
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001567 self.allocate_exhaustive(0, 0)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001568 # Optimal allocation has been found, move lrs accordingly
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001569 for i, lr in enumerate(self.lrs):
1570 if self.evicted[i]:
1571 self.evict(lr, max_mem, scratched_fms)
1572 if lr not in evicted_fms:
1573 evicted_fms.append(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001574 else:
Rickard Bolin1bd20f22022-12-07 08:54:53 +00001575 self.keep(lr, min_mem)
1576 if lr in evicted_fms:
1577 evicted_fms.remove(lr)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001578
1579
Tim Halld8339a72021-05-27 18:49:40 +01001580def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1581 """Entry point for the Scheduler"""
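# For each NPU subgraph the flow below is: build the scheduler representation, move constant data,
# create and buffer the Max schedule, create the Min schedule and its cascades, optionally optimize
# for performance, apply the chosen schedule, move feature maps to fast storage and, if feature maps
# were evicted, retry with reduced weight buffering. Finally the tensor allocation is updated.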
1582 # Initialize CPU subgraphs
1583 schedulers = dict()
1584 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001585 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001586 if sg.placement != PassPlacement.Npu:
1587 # Create cascaded passes for CPU Ops
1588 cascaded_passes = []
1589 for idx, ps in enumerate(sg.passes):
1590 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001591 ps.name,
1592 SchedulingStrategy.WeightStream,
1593 ps.inputs,
1594 [],
1595 ps.outputs,
1596 [ps],
1597 ps.placement,
1598 False,
Tim Halld8339a72021-05-27 18:49:40 +01001599 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001600
Tim Halld8339a72021-05-27 18:49:40 +01001601 cps.time = idx
1602 ps.cascade = cps
1603 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001604
Tim Halld8339a72021-05-27 18:49:40 +01001605 sg.cascaded_passes = cascaded_passes
1606 else:
1607 # Npu subgraph - create schedule
1608 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1609 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001610
Tim Halld8339a72021-05-27 18:49:40 +01001611 scheduler.create_scheduler_representation(arch)
1612 sg.sched_ops = scheduler.sched_ops
1613 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001614
Tim Halld8339a72021-05-27 18:49:40 +01001615 # Create the Max schedule template
1616 max_schedule_template = scheduler.create_initial_schedule()
1617 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001618
Tim Halld8339a72021-05-27 18:49:40 +01001619 # Create the optimised Max schedule
1620 sg.schedule = max_schedule_template
1621 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001622 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001623 sg.schedule = opt_max_schedule
1624 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001625
Tim Halld8339a72021-05-27 18:49:40 +01001626 # Create Min schedule
1627 min_schedule = scheduler.propose_minimal_schedule()
1628 initial_sram_limit = scheduler_options.optimization_sram_limit
1629 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1630 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001631
Johan Alfvén255dad72022-07-16 18:27:05 +02001632 # Build cascades for Min schedule
1633 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001634 sg.schedule = min_schedule
1635 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001636
Tim Halld8339a72021-05-27 18:49:40 +01001637 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1638 # Create an optimized schedule
1639 sg.schedule = scheduler.optimize_schedule(
1640 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1641 )
1642 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001643
Tim Halld8339a72021-05-27 18:49:40 +01001644 scheduler.apply_schedule(sg.schedule)
1645 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001646
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001647 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1648 # It might be possible to gain performance by reducing
1649 # weight buffer size and instead fit fms in fast storage
1650 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1651
Tim Halld8339a72021-05-27 18:49:40 +01001652 if scheduler_options.verbose_schedule:
1653 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001654
Tim Halld8339a72021-05-27 18:49:40 +01001655 # Evaluate schedule
1656 _update_tensor_allocation(nng, arch, options)