# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
import copy
from enum import auto
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose


def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
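    """Return the storage shape for the given tensor format; for NHCWB16 the depth is
    rounded up to a multiple of 16 (e.g. a depth of 20 is stored as 32)"""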
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor = None
        self.npu_scales_tensor = None
        self.buffered_weight_tensor = None
        self.cycles = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(self.block_config, self.weights_size, self.stripe_input, self.stripe_input2, self.stripe,)
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        res += (
            f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n"
        )
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self, optimization_strategy, sram_target, verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
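    """Lightweight scheduler-side view of a feature map tensor: shape, data type, memory area and format"""
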
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm.resampling_mode
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )
        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(ps.ifm_shapes[0], ps.ifm_tensor.dtype, ps.ifm_tensor.mem_area, ps.ifm_tensor.format,)

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1], ps.ifm2_tensor.dtype, ps.ifm2_tensor.mem_area, ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(ps.ofm_shapes[0], ps.ofm_tensor.dtype, ps.ofm_tensor.mem_area, ps.ofm_tensor.format,)

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.index = 0

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2 and self.ifm2.shape
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> ArchitectureBlockConfig:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduler"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule = None
        self.scheduler_options = options

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue
                    if not output.needs_linear_format:
                        output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")

        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                cost.slack_buffering_memory,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
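        """Returns True if the weights need to be DMA'd from permanent storage into fast storage"""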
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with a limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
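            # Only the fraction of the weights that can be fetched within the previous op's slack cycles
            # is prebuffered, capped at half of the double buffer limit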
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_bytes > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode the weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )

                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.max_range_bytes <= half_buffer_limit
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

            # Determine whether the weights need to be double buffered
            weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes)

            # Only buffer weights if there's still space left for the buffer
            if weight_buffer_size <= buffer_limit_bytes:
                assert weight_buffer_size % 16 == 0
                # Determine whether to double buffer or single buffer
                if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                    weight_buffer_size = weight_buffer_size * 2
                    weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
                else:
                    weight_tensor_purpose = TensorSubPurpose.Standard

                cost.buffered_weight_tensor = self.buffer_tensor(
                    encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name
                )
                if ref_cost.cascade == 0:
                    # Determine if the lifetime can be extended and pre-buffer weights under the previous operation
                    cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory

                cost.slack_buffering_memory -= weight_buffer_size
            else:
                # Don't slice or buffer - use the whole depth from persistent storage
                cost.ofm_depth_slices = ofm_full_depth_slices
                encoded_weights = full_weights
                encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
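        """Creates a scratch tensor in fast storage used to hold the (double-)buffered encoded weights"""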
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name + "_buffer")
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
        next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op = None
        for sched_op in reversed(self.sched_ops):
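            # Produce at least one vertical stride of the consuming Op per stripe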
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            if ref_cost[sched_op].buffered_weight_tensor:
                # If the weights are buffered in the reference schedule they should be in the new proposal
                weight_tensor = cost.npu_weights_tensor
                cost.buffered_weight_tensor = self.buffer_tensor(
                    weight_tensor, TensorSubPurpose.Standard, len(weight_tensor.buffer), weight_tensor.name
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe
            stripe = sched_op.ifm.shape.with_height(stripe.height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = 0
                if cost[sched_op].buffered_weight_tensor:
                    op_weight_buffer = cost[sched_op].buffered_weight_tensor.storage_size()

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
781 ref_cost = ref_schedule.cost_map
782 # Extract the ops that are part of this sub-schedule
783 start = cascade_info.start
784 end = cascade_info.end
785 sub_schedule_ops = self.sched_ops[start : end + 1]
786 # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
787 sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
788 for sched_op in sub_schedule_ops:
789 sub_schedule.cost_map[sched_op] = ref_cost[sched_op]
790
791 sub_schedule.cascades[end] = cascade_info
792 # Use the memory snapshot from the reference schedule
793 sub_schedule.memory_snapshot = ref_schedule.memory_snapshot
794
795 # Calculate memory usage that is live during the sub-schedule but not part of it
796 time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
797 mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
798 # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
799 # included in a cascade or not
800 persistent_initial_ifm = (
801 sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
802 )
803 # Calculate non-local-mem-usage per Operator
804 non_local_mem_usage = {}
805 for idx, sched_op in enumerate(sub_schedule_ops):
806 non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
807 if idx != 0:
808 non_local_mem_usage[sched_op] += persistent_initial_ifm
809
810 cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
811
812 # Start by adding buffering
Tim Hall789e6f32021-06-17 17:02:31 +0100813 buffered_sub_schedule = self.propose_schedule_buffering(
814 sub_schedule, self.scheduler_options.optimization_sram_limit
815 )
Tim Halld8339a72021-05-27 18:49:40 +0100816 # Copy the cascades over from the unbuffered-schedule
817 buffered_sub_schedule.cascades = sub_schedule.cascades
818
819 # Generate the possible stripings for the final Op in the sub-schedule
820 final_ofm_shape = sub_schedule_ops[-1].ofm.shape
821 possible_stripes = [
822 final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
823 ]
824
825 # Propose different striping - the possible stripes are proposed similarly to a binary search
Jacob Bohlinfad72042021-08-24 21:51:41 +0200826 best_schedule = None
Tim Halld8339a72021-05-27 18:49:40 +0100827 iteration = 0
828 while len(possible_stripes) > 1:
829 proposed_stripe = possible_stripes[len(possible_stripes) // 2]
830 proposed_schedule = self.propose_schedule_striping(
831 proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
832 )
833
834 cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)
835
836 # Check if proposal fits
837 proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
838 if (proposed_schedule_mem_usage) <= memory_limit:
839 # Remove all possible stripes smaller than this
840 possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
841 best_schedule = proposed_schedule
842 if not proposed_schedule.cascades:
843 # No cascading required - early exit
844 break
845 else:
846 # Proposal doesn't fit within the limit - remove all possible stripes larger than this
847 possible_stripes = possible_stripes[: len(possible_stripes) // 2]
848
849 iteration += 1
850
851 return best_schedule
852
853 def optimize_schedule(
854 self, schedule: Schedule, max_sched: Schedule, max_template: Schedule, options: SchedulerOptions,
855 ) -> Schedule:
856 """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
857 sram_limit = options.optimization_sram_limit
858 if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
859 # Maximum performance schedule fits within the SRAM target
860 return max_sched
861
Jacob Bohlinfad72042021-08-24 21:51:41 +0200862 # Iterate over a copy of the cascades since they may change during the loop
863 for cascade_info in list(schedule.cascades.values()):
Tim Halld8339a72021-05-27 18:49:40 +0100864 # Optimize the sub-schedule in this cascade
865 opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
Jacob Bohlinfad72042021-08-24 21:51:41 +0200866 if opt_sub_schedule:
867 # Remove the existing cascade
868 del schedule.cascades[cascade_info.end]
869 # Update the sub-schedule Op and cascade costs to the full schedule
870 schedule.cost_map.update(opt_sub_schedule.cost_map)
871 schedule.cascades.update(opt_sub_schedule.cascades)
Tim Halld8339a72021-05-27 18:49:40 +0100872
873 # Update memory snapshot
874 self.sg.schedule = schedule
875 self.update_op_memory_snapshot(schedule)
876 # Propose schedule buffering to the optimized schedule
Tim Hall789e6f32021-06-17 17:02:31 +0100877 optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +0100878 # Copy the cascade's metadata from the unbuffered schedule
879 optimized_sched.cascades = schedule.cascades
880 return optimized_sched
881
882 def apply_schedule(self, sched: Schedule):
883 """Applies the given schedule as a final solution"""
884 for sched_op in self.sched_ops:
885 op_info = sched.cost_map[sched_op]
886 cascade_info = sched.cascades.get(op_info.cascade, None)
887 if cascade_info and sched_op in cascade_info.buffers:
888 buffer_tens = sched_op.ifm.connection.parent_tens
889 # Apply memory area and type
890 buffer_tens.mem_area = self.arch.fast_storage_mem_area
891 buffer_tens.mem_type = MemType.Scratch_fast
892 # Apply Rolling buffer
893 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
894 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
895
896 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
897
898 # Ensure that the src_tensor reference is set correctly
899 if op_info.buffered_weight_tensor:
900 op_info.buffered_weight_tensor.src_tensor = op_info.npu_weights_tensor
901
902 def use_fast_storage_for_feature_maps(self, schedule: Schedule, memory_limit: int):
903 if self.arch.fast_storage_mem_area == self.arch.feature_map_storage_mem_area:
904 return
905
906 # Force all OFMs to fast-storage
907 for sched_op in self.sched_ops:
908 cost = schedule.cost_map[sched_op]
909 if cost.cascade == 0:
910 if sched_op.get_dependants():
911 ofm_tens = sched_op.ofm.connection.parent_tens
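                    # Skip OFMs with a missing (None) consumer, e.g. tensors that are also consumed outside the NPU subgraph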
                    if not any(cons is None for cons in ofm_tens.consumer_list):
                        ofm_tens.mem_area = self.arch.fast_storage_mem_area
                        ofm_tens.mem_type = MemType.Scratch_fast

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(), mem_area, mem_type_set, False, lr_graph, Tensor.AllocationQuantum,
            )

        # Iterate over live ranges and evict tensors that don't fit
        fast_storage_snapshot = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        for lr in lr_graph.lrs:
            if (
                lr.mem_area == self.arch.fast_storage_mem_area
                and max(fast_storage_snapshot[lr.start_time : lr.end_time + 1]) > memory_limit
            ):
                # Evict tensor to DRAM
                for tens in lr.tensors:
                    if tens.purpose == TensorPurpose.FeatureMap and tens.sub_purpose == TensorSubPurpose.Standard:
                        # Can only evict unbuffered FeatureMaps
                        tens.mem_area = self.arch.feature_map_storage_mem_area
                        tens.mem_type = MemType.Scratch
                        # Adjust the snapshot
                        fast_storage_snapshot[lr.start_time : lr.end_time + 1] -= lr.size

    def move_constant_data(self):
        """Determine if data can be moved from permanent storage to another memory area. A move
        will generate a DMA command in the high-level command stream"""
        for sched_op in self.sched_ops:
            parent_op = sched_op.parent_op
            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
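            # Half of the SHRAM not reserved for outputs (or the LUT, if used) is available to hold an IFM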
            max_ifm_shram_avail = (
                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
                * self.arch.shram_bank_size
                // 2
            )

            for idx, tens in enumerate(parent_op.inputs):
                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    # Tensor is in permanent storage
                    # Only when permanent storage differs from feature map storage is there a point in moving the data
                    if (
                        tens.mem_area in self.arch.permanent_storage_mem_area
                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                    ) or tens.purpose == TensorPurpose.LUT:
                        if tens.purpose == TensorPurpose.LUT or (
                            # For elementwise broadcast
                            tens.purpose == TensorPurpose.FeatureMap
                            and sched_op.op_type.is_binary_elementwise_op()
                            and tens.shape != []
                            and sched_op.ifm.shape != sched_op.ofm.shape
                            and parent_op.write_shape is None
                            and tens.storage_size() > max_ifm_shram_avail
                        ):
                            only_vector_product_consumers = all(
                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                                for oper in tens.consumers()
                            )

                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                                new_tens = tens.clone_into_fast_storage(self.arch)
                                if tens.purpose == TensorPurpose.LUT:
                                    new_tens.mem_area = MemArea.Shram

                                new_tens.consumer_list.append(parent_op)
                                parent_op.inputs[idx] = new_tens
                                # If the index is out of range, IFM and IFM2 are the same tensor
                                # and pass inputs don't have duplicates
                                if idx < len(sched_op.parent_ps.inputs):
                                    sched_op.parent_ps.inputs[idx] = new_tens

    def print_schedule(self, schedule: Schedule):
        print(f"Schedule: '{schedule.name}'")
        for sched_op in self.sched_ops:
            if sched_op not in schedule.cost_map:
                # Sub-schedule printing
                continue

            op_info = schedule.cost_map[sched_op]
            print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
            print(f"\t\tType: {sched_op.op_type}")
            print(f"\t\tKernel: {sched_op.kernel}")
            print(f"{op_info}")
            mem_usage = (
                schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(schedule.memory_snapshot)
                else 0
            )
            print(f"\t\tSRAM Used: {mem_usage} bytes")

        print(f"\tCascades:")
        for i, cascade in enumerate(schedule.cascades.values()):
            print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Creates live ranges and runs tensor allocator for the current schedule
    (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
    and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
        )


def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
    """Entry point for the Scheduler"""
    # Initialize CPU subgraphs
    schedulers = dict()
    # Initialize schedulers with max schedule. Only schedule NPU subgraphs
    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            # Create cascaded passes for CPU Ops
            cascaded_passes = []
            for idx, ps in enumerate(sg.passes):
                cps = CascadedPass(
                    ps.name, SchedulingStrategy.WeightStream, ps.inputs, [], ps.outputs, [ps], ps.placement, False,
                )

                cps.time = idx
                ps.cascade = cps
                cascaded_passes.append(cps)

            sg.cascaded_passes = cascaded_passes
        else:
            # Npu subgraph - create schedule
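            # The flow below: build the scheduler graph, move constant data, create and buffer the MAX
            # schedule, then build a cascaded MIN schedule and, for the Performance strategy, optimize
            # between the two before applying the result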
            scheduler = Scheduler(nng, sg, arch, scheduler_options)
            schedulers[sg] = scheduler

            scheduler.create_scheduler_representation(arch)
            sg.sched_ops = scheduler.sched_ops
            scheduler.move_constant_data()

            # Create the Max schedule template
            max_schedule_template = scheduler.create_initial_schedule()
            scheduler.max_schedule = max_schedule_template

            # Create the optimised Max schedule
            sg.schedule = max_schedule_template
            scheduler.update_op_memory_snapshot(max_schedule_template)
            opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
            sg.schedule = opt_max_schedule
            scheduler.update_op_memory_snapshot(opt_max_schedule)

            # Create Min schedule
            min_schedule = scheduler.propose_minimal_schedule()
            initial_sram_limit = scheduler_options.optimization_sram_limit
            if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
                initial_sram_limit = scheduler.min_memory_req

            cascade_builder = CascadeBuilder(scheduler.sched_ops, arch.is_spilling_enabled())
            cascade_builder.build_cascades(min_schedule, max_schedule_template, initial_sram_limit)
            sg.schedule = min_schedule
            scheduler.update_op_memory_snapshot(min_schedule)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
                # Create an optimized schedule
                sg.schedule = scheduler.optimize_schedule(
                    min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
                )
                scheduler.update_op_memory_snapshot(sg.schedule)

            scheduler.apply_schedule(sg.schedule)
            scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

            if scheduler_options.verbose_schedule:
                scheduler.print_schedule(sg.schedule)

    # Evaluate schedule
    _update_tensor_allocation(nng, arch, options)