# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
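# The scheduling is done in a number of steps: a max schedule (no striping) and a min schedule (maximum
# striping) are created, cascades are built for the min schedule and, for the Performance strategy, each
# cascade is then optimized by proposing weight buffering and new stripe sizes until the SRAM target is met.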
# Enable class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor

def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume height and width required for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"

class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
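        """Check whether the OFM of this pass is better kept in linear (NHWC) format.
        Returns True only for the Size optimization strategy, for elementwise ops whose OFM depth is not a
        multiple of 16 and whose OFM can overwrite one of its NHWC inputs (matching shape, format and dtype)."""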
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
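        # The SRAM headroom left at this op's time index bounds how much weight buffering can be added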
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                cost.slack_buffering_memory,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)
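                # buffering_depth is the slice depth whose weight transfer can be hidden under the compute
                # time of the prebuffered depth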

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_bytes > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                double_buffer_size = encoded_weights.double_buffer_size()
                if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensors = [
                    self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[0],
                        weight_tensor.name + "_buffer",
                    )
                ]
                if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                    buf2 = self.buffer_tensor(
                        encoded_weights,
                        weight_tensor_purpose,
                        encoded_weights.double_buffer_sizes[1],
                        weight_tensor.name + "_buffer2",
                    )
                    cost.buffered_weight_tensors.append(buf2)
                last_used_buffer_idx = len(cost.ofm_depth_slices) % 2
                weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]
                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                    # under the previous operation
                    cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_sizes[0] < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            for buffered_tens in ref_cost[sched_op].buffered_weight_tensors:
                # If the weights are buffered in the reference schedule they should be in the new proposal
                weight_tensor = cost.npu_weights_tensor
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor, TensorSubPurpose.Standard, buffered_tens.storage_size(), buffered_tens.name
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe
            stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape
        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
        ]

        # Propose different striping - the possible stripes are proposed similarly to a binary search
        best_schedule = None
        iteration = 0
        while len(possible_stripes) > 1:
            proposed_stripe = possible_stripes[len(possible_stripes) // 2]
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit:
                # Remove all possible stripes smaller than this
                possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
                best_schedule = proposed_schedule
                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                # Proposal doesn't fit within the limit - remove all possible stripes larger than this
                possible_stripes = possible_stripes[: len(possible_stripes) // 2]

            iteration += 1

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

    def apply_schedule(self, sched: Schedule):
        """Applies the given schedule as a final solution"""
        for sched_op in self.sched_ops:
            op_info = sched.cost_map[sched_op]
            cascade_info = sched.cascades.get(op_info.cascade, None)
            if cascade_info and sched_op in cascade_info.buffers:
                buffer_tens = sched_op.ifm.connection.parent_tens
                # Apply memory area and type
                buffer_tens.mem_area = self.arch.fast_storage_mem_area
                buffer_tens.mem_type = MemType.Scratch_fast
                # Apply Rolling buffer
                buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
                buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)

            sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

            # Ensure that the src_tensor reference is set correctly
            for tens in op_info.buffered_weight_tensors:
                tens.src_tensor = op_info.npu_weights_tensor

    def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
        scratched_fms = {}
        max_mem_usage = []
        base_mem_usage = []
        fast_storage_type = MemType.Scratch_fast
        fast_storage_mem_area = self.arch.fast_storage_mem_area

        # Force all OFMs to fast-storage
        for sched_op in self.sched_ops:
            cost = schedule.cost_map[sched_op]
            if cost.cascade == 0 and sched_op.get_dependants():
                ofm_tens = sched_op.ofm.connection.parent_tens
                if not any(cons is None for cons in ofm_tens.consumer_list):
                    if ofm_tens not in scratched_fms:
                        scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
                    ofm_tens.mem_area = fast_storage_mem_area
                    ofm_tens.mem_type = fast_storage_type

        # Collect live ranges from tensors
        memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )
        max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

        # If true, everything fits and we can proceed
        if max(max_mem_usage) <= staging_limit:
            return

        # Build up the base memory usage by removing the
        # mem_usage of the lrs we previously moved to fast-storage
        base_mem_usage = np.array(max_mem_usage)
        curr_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if scratched_fms.get(tens):
                    curr_lrs.append(lr)
                    base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
                    break

        competing_lrs = []
        for lr in curr_lrs:
            base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
            # If true, the lr will never fit and may thus be evicted
            if base_usage + lr.size > staging_limit:
                FastStorageComponentAllocator.evict(lr, max_mem_usage, scratched_fms)
                continue
            # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
            # the memory limit cannot be exceeded if max_mem_usage does not.
            # Thus, the affected lrs can remain in fast-storage if the following is true
            if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
                FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
            else:
                competing_lrs.append(lr)
        sz = len(competing_lrs)
        # All lrs and their tensors have been handled if sz is zero, we may thus return
        if sz == 0:
            return

        competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
        start = 0
        start_time = competing_lrs[0].start_time
        end_time = competing_lrs[0].end_time
        component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
        # Build up components and then allocate each separately
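        # A component is a group of live ranges that overlap in time, capped at max_exhaustive_size
        # entries since each component is allocated with an exhaustive search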
        for i, lr in enumerate(competing_lrs):
            if lr.start_time <= end_time and i - start < component_allocator.max_exhaustive_size:
                start_time = min(start_time, lr.start_time)
                end_time = max(end_time, lr.end_time)
            else:
                component_allocator.allocate_component(
                    component_allocator,
                    competing_lrs[start:i],
                    max_mem_usage,
                    base_mem_usage,
                    staging_limit,
                    scratched_fms,
                )
                start = i
                start_time = lr.start_time
                end_time = lr.end_time
        component_allocator.allocate_component(
            component_allocator, competing_lrs[start:sz], max_mem_usage, base_mem_usage, staging_limit, scratched_fms
        )

    def move_constant_data(self):
        """Determine if data can be moved from permanent storage to another memory area. A move
        will generate a DMA command in the high-level command stream"""
        for sched_op in self.sched_ops:
            parent_op = sched_op.parent_op
            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
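            # SHRAM available for IFM data: half of the banks left after reserving the output banks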
            max_ifm_shram_avail = (
                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
                * self.arch.shram_bank_size
                // 2
            )

            for idx, tens in enumerate(parent_op.inputs):
                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    # Tensor is in permanent storage
                    # Only when permanent storage differs from feature map storage is there a point in moving the data
                    if (
                        tens.mem_area in self.arch.permanent_storage_mem_area
                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                    ) or tens.purpose == TensorPurpose.LUT:
                        if tens.purpose == TensorPurpose.LUT or (
                            # For elementwise broadcast
                            tens.purpose == TensorPurpose.FeatureMap
                            and sched_op.op_type.is_binary_elementwise_op()
                            and tens.shape != []
                            and sched_op.ifm.shape != sched_op.ofm.shape
                            and parent_op.write_shape is None
                            and tens.storage_size() > max_ifm_shram_avail
                        ):
                            only_vector_product_consumers = all(
                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                                for oper in tens.consumers()
                            )

                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                                new_tens = tens.clone_into_fast_storage(self.arch)
                                if tens.purpose == TensorPurpose.LUT:
                                    new_tens.mem_area = MemArea.Shram

                                new_tens.consumer_list.append(parent_op)
                                parent_op.inputs[idx] = new_tens
                                # If the index is out of range, IFM and IFM2 are the same tensor
                                # and pass inputs don't have duplicates
                                if idx < len(sched_op.parent_ps.inputs):
                                    sched_op.parent_ps.inputs[idx] = new_tens

    def print_schedule(self, schedule: Schedule):
        print(f"Schedule: '{schedule.name}'")
        for sched_op in self.sched_ops:
            if sched_op not in schedule.cost_map:
                # Sub-schedule printing
                continue

            op_info = schedule.cost_map[sched_op]
            print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
            print(f"\t\tType: {sched_op.op_type}")
            print(f"\t\tKernel: {sched_op.kernel}")
            print(f"{op_info}")
            mem_usage = (
                schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(schedule.memory_snapshot)
                else 0
            )
            print(f"\t\tSRAM Used: {mem_usage} bytes")

        print("\tCascades:")
        for i, cascade in enumerate(schedule.cascades.values()):
            print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Creates live ranges and runs tensor allocator for the current schedule
    (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
    and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )


class FastStorageComponentAllocator:
    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        self.base_mem_usage = base_mem_usage
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        self.evicted = []
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_allocated_size = 0
        self.max_exhaustive_size = 20

    def allocate_exhaustive(self, ix, alloc_size):
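        # Recursively try keeping (when it can fit) or evicting each remaining live range, tracking the
        # combination that keeps the largest total size resident in fast storage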
        if ix >= len(self.lrs):
            if alloc_size > self.best_allocated_size:
                self.best_allocated_size = alloc_size
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        for t in range(lr.start_time, lr.end_time):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        can_fit = base_usage + lr.size <= self.staging_limit
        always_fits = can_fit

        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

        if can_fit or always_fits:
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            self.allocate_exhaustive(ix + 1, alloc_size + lr.size)
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, alloc_size)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += lr.size if increase else -lr.size
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area = scratched_fms[tens][0]
                tens.mem_type = scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms):
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * len(lrs)
        allocator.curr_evicted = [0] * sz
        allocator.best_allocated_size = -1
        # Recursively evaluate all permutations of allocations of the lrs found in the component
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for i, e in enumerate(allocator.evicted):
            if e:
                self.evict(lrs[i], max_mem, scratched_fms)
            else:
                self.keep(lrs[i], min_mem, staging_limit)


def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
    """Entry point for the Scheduler"""
    # Initialize CPU subgraphs
    schedulers = dict()
    # Initialize schedulers with max schedule. Only schedule NPU subgraphs
    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            # Create cascaded passes for CPU Ops
            cascaded_passes = []
            for idx, ps in enumerate(sg.passes):
                cps = CascadedPass(
                    ps.name,
                    SchedulingStrategy.WeightStream,
                    ps.inputs,
                    [],
                    ps.outputs,
                    [ps],
                    ps.placement,
                    False,
                )

                cps.time = idx
                ps.cascade = cps
                cascaded_passes.append(cps)

            sg.cascaded_passes = cascaded_passes
        else:
            # Npu subgraph - create schedule
            scheduler = Scheduler(nng, sg, arch, scheduler_options)
            schedulers[sg] = scheduler

            scheduler.create_scheduler_representation(arch)
            sg.sched_ops = scheduler.sched_ops
            scheduler.move_constant_data()

            # Create the Max schedule template
            max_schedule_template = scheduler.create_initial_schedule()
            scheduler.max_schedule = max_schedule_template

            # Create the optimised Max schedule
            sg.schedule = max_schedule_template
            scheduler.update_op_memory_snapshot(max_schedule_template)
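            # 1 << 32 bytes is effectively an unlimited staging limit, so buffering is proposed without
            # any SRAM restriction for the max schedule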
            opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
            sg.schedule = opt_max_schedule
            scheduler.update_op_memory_snapshot(opt_max_schedule)

            # Create Min schedule
            min_schedule = scheduler.propose_minimal_schedule()
            initial_sram_limit = scheduler_options.optimization_sram_limit
            if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
                initial_sram_limit = scheduler.min_memory_req

            cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())
            cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)
            sg.schedule = min_schedule
            scheduler.update_op_memory_snapshot(min_schedule)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
                # Create an optimized schedule
                sg.schedule = scheduler.optimize_schedule(
                    min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
                )
                scheduler.update_op_memory_snapshot(sg.schedule)

            scheduler.apply_schedule(sg.schedule)
            scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

            if scheduler_options.verbose_schedule:
                scheduler.print_schedule(sg.schedule)

    # Evaluate schedule
    _update_tensor_allocation(nng, arch, options)