# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Allow class name forward references in the type annotations (see PEP 563).
21from __future__ import annotations
22
Diego Russoea6111a2020-04-14 18:41:58 +010023import copy
Johan Alfvén5e0ae552022-02-09 21:20:10 +010024from collections import namedtuple
Tim Halld8339a72021-05-27 18:49:40 +010025from enum import auto
26from enum import IntEnum
Johan Alfvén6f4cb032022-05-05 08:42:46 +020027from typing import Any
Tim Halld8339a72021-05-27 18:49:40 +010028from typing import Dict
29from typing import List
30from typing import Optional
31from typing import Tuple
Jonas Ohlsson845e2322022-03-01 12:39:55 +010032from typing import TYPE_CHECKING
33
34# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
35if TYPE_CHECKING:
36 from .npu_performance import CycleCost
Diego Russoea6111a2020-04-14 18:41:58 +010037
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +010038import numpy as np
39
Diego Russoea6111a2020-04-14 18:41:58 +010040from . import live_range
Tim Hall79d07d22020-04-27 18:20:16 +010041from . import npu_performance
Tim Halld8339a72021-05-27 18:49:40 +010042from . import tensor_allocation
43from . import weight_compressor
44from .architecture_allocator import ArchitectureBlockConfig
45from .architecture_allocator import find_block_config
46from .architecture_allocator import get_ifm_area_required
Fredrik Svedbergd03dc502022-06-30 10:44:12 +020047from .architecture_allocator import to_upscale
Tim Halld8339a72021-05-27 18:49:40 +010048from .architecture_features import ArchitectureFeatures
49from .architecture_features import Block
50from .cascade_builder import CascadeBuilder
51from .cascade_builder import CascadeInfo
Fredrik Svedberg880e7352020-08-25 11:31:47 +020052from .data_type import DataType
Diego Russoe8a10452020-04-21 17:39:10 +010053from .nn_graph import CascadedPass
Tim Halld8339a72021-05-27 18:49:40 +010054from .nn_graph import Graph
55from .nn_graph import Pass
Diego Russoe8a10452020-04-21 17:39:10 +010056from .nn_graph import PassPlacement
Diego Russoe8a10452020-04-21 17:39:10 +010057from .nn_graph import SchedulingStrategy
Tim Halld8339a72021-05-27 18:49:40 +010058from .nn_graph import Subgraph
59from .numeric_util import round_down
60from .numeric_util import round_up
Diego Russoe8a10452020-04-21 17:39:10 +010061from .operation import NpuBlockType
Louis Verhaardaee5d752020-09-30 09:01:52 +020062from .operation import Op
Tim Halld8339a72021-05-27 18:49:40 +010063from .shape4d import Shape4D
Diego Russoe8a10452020-04-21 17:39:10 +010064from .tensor import MemArea
Patrik Gustavssoneca2e952020-05-27 09:15:11 +020065from .tensor import MemType
Tim Halld8339a72021-05-27 18:49:40 +010066from .tensor import Tensor
Diego Russoe8a10452020-04-21 17:39:10 +010067from .tensor import TensorFormat
68from .tensor import TensorPurpose
69from .tensor import TensorSubPurpose
Jonas Ohlsson845e2322022-03-01 12:39:55 +010070from .weight_compressor import NpuWeightTensor
Jacob Bohlin1a666972020-09-11 10:04:15 +020071
Tim Hall79d07d22020-04-27 18:20:16 +010072
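# NHCWB16 stores feature maps in 16-channel bricks, so the storage shape for that format
# rounds the depth up to the next multiple of 16.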
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
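        # OFM depth ranges that the weights are encoded and fetched in; [0, depth] means one full-depth slice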
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__

class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        if self.op_type.is_binary_elementwise_op() and CascadeBuilder.element_wise_cascading_conformity(self):
            ifm1 = ps.ifm_tensor
            ifm2 = ps.ifm2_tensor
            ofm = ps.ofm_tensor
            assert ifm1.elements() > 0
            assert ifm2.elements() > 0

            if (
                # The non-constant IFM should be the primary input
                (ifm1.ops[0].type == Op.Const and ifm2.ops[0].type != Op.Const)
                # The non-broadcasted IFM should be the primary input
                or (ifm1.shape != ofm.shape and ifm2.shape == ofm.shape)
            ):
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

290 def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
291 """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
292 ifm_shape = self.ifm.shape
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100293 ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
Tim Halld8339a72021-05-27 18:49:40 +0100294 ofm_shape = stripe
295
296 if ofm_shape != self.ofm.shape:
297 # Striped Op - Need to calculate stripe input volume
298 stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
299 # Ensure stripe input volume is within the full IFM volume
300 stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
301 stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
302 ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)
303
304 if self.ifm2:
305 stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
306 stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
307 ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)
308
309 block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)
310
311 scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
312 if self.parent_op.weights:
313 # Default full-depth weight encoding with no buffering
Tim Halld784af72021-06-08 21:25:57 +0100314 (
315 scheduler_op_info.npu_weights_tensor,
316 scheduler_op_info.npu_scales_tensor,
317 ) = weight_compressor.encode_weight_and_scale_tensor(
Tim Halld8339a72021-05-27 18:49:40 +0100318 self.arch,
319 self.parent_op,
320 self.parent_op.weights,
321 self.parent_op.bias,
322 self.kernel,
323 block_config,
324 [0, self.ofm.shape.depth],
325 )
326
327 self.parent_ps.block_config = block_config.old_style_representation()
328 return scheduler_op_info
329
330 def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
331 """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
332 ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())
333
Fredrik Svedberg3ff7a4a2021-09-29 10:08:04 +0200334 return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)
Tim Halld8339a72021-05-27 18:49:40 +0100335
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100336 def _calculate_min_stripe_input(self) -> Tuple[int, int]:
Tim Halld8339a72021-05-27 18:49:40 +0100337 # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
338 min_stripe = self.ofm.shape.with_hw(1, 1)
339 return self._get_stripe_input_requirement(min_stripe)
340
341 def _get_block_config(
342 self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
Jonas Ohlsson845e2322022-03-01 12:39:55 +0100343 ) -> Optional[ArchitectureBlockConfig]:
Tim Halld8339a72021-05-27 18:49:40 +0100344 # Returns a block config and SHRAM layout
345 lut_banks = 2 if self.parent_op.activation_lut else 0
346 return find_block_config(
347 self.arch,
348 self.op_type.npu_block_type,
349 ofm_shape,
350 ifm_shape,
351 ifm2_shape,
352 uses_scalar,
353 self.ifm.dtype.size_in_bits(),
354 self.kernel,
355 lut_banks,
356 self.parent_op.has_scaling(),
357 self.resampling_mode,
358 )
359
360
361class Connection:
362 """Scheduler internal representation of a Tensor that connects two SchedulerOperations
363 This class can be seen as an edge within the Scheduler Graph representation
364 """
365
366 def __init__(self, tensor: Tensor):
367 self.parent_tens = tensor
368
369 # SchedulerOperation relationships
370 self.producers: List[SchedulerOperation] = []
371 self.consumers: List[SchedulerOperation] = []
Tim Hall79d07d22020-04-27 18:20:16 +0100372
373 def __str__(self):
Tim Halld8339a72021-05-27 18:49:40 +0100374 return f"<Connection {self.parent_tens.name}>"
Tim Hall79d07d22020-04-27 18:20:16 +0100375
376 __repr__ = __str__
377
378
class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"


class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

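        # NHCWB16 only pads depths that are not a multiple of 16, so an already aligned OFM
        # loses nothing by keeping the brick format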
        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
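        # Build the same performance query as estimate_op_performance, but return element access
        # counts instead of cycle costs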
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
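        # The op's own cycle count becomes slack that the next op can hide weight DMA behind, and
        # whatever SRAM is unused at this time index becomes the buffering budget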
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are feature maps that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there any point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
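                # and use them to size the steady-state buffering depth: the share of the full weight
                # transfer that can be hidden behind those cycles, expressed as an OFM depth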
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on the depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

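                    # Still too large - halve whichever depth is currently larger and retry with finer slices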
                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies the
        next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe
            height = stripe.height + stripe.height % to_upscale(sched_op.resampling_mode)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has the ifm and ofm in SRAM (no buffered weight tensors)
                op_mem_usage = sched_op.ifm_size_in_bytes() + sched_op.ofm_size_in_bytes()

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage

        # Create cascades for the Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape
        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(1, final_ofm_shape.height // 2 + 1)
        ]

        # Propose different striping - the possible stripes are proposed similarly to a binary search
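        # Larger stripes generally mean less re-fetching, so keep the largest final stripe whose
        # proposal still fits within the memory limit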
        best_schedule = None
        iteration = 0
        while len(possible_stripes) > 1:
            proposed_stripe = possible_stripes[len(possible_stripes) // 2]
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)
            if (proposed_schedule_mem_usage) <= memory_limit:
                # Remove all possible stripes smaller than this
                possible_stripes = possible_stripes[len(possible_stripes) // 2 :]
                best_schedule = proposed_schedule
                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                # Proposal doesn't fit within the limit - remove all possible stripes larger than this
                possible_stripes = possible_stripes[: len(possible_stripes) // 2]

            iteration += 1

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

    def optimize_weight_buffering_size(
        self,
        min_schedule: Schedule,
        options: SchedulerOptions,
    ):
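        # Weight buffers and feature maps compete for the same fast storage. Measure the default
        # schedule, limit the weight-buffer size for ops whose buffers overlap with evicted feature
        # maps, rebuild the buffering, and keep the result only if DRAM cycles improve and total
        # cycles do not regress.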
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        # Restore mem/type for scratched_fms
        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.update_op_memory_snapshot(self.sg.schedule)

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Find the relation between the sched_op and the buffering tensor
        weight_ops = {}
        for sched_op in self.sched_ops:
            cost = self.sg.schedule.cost_map[sched_op]
            for tens in cost.buffered_weight_tensors:
                weight_ops[tens] = sched_op

        # Filter out weight buffer live ranges
        weight_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if weight_ops.get(tens):
                    weight_lrs.append(lr)
                    break

        # See if any evicted fm overlaps with a weight buffering op.
        # If this is the case add a size limitation to the buffering op
        for lr in self.evicted_fms:
            for weight_lr in weight_lrs:
                if lr.overlaps_ranges(weight_lr):
                    for tens in weight_lr.tensors:
                        sched_op = weight_ops.get(tens)
                        if sched_op:
                            # Add size reduction to the op
                            sched_op.evicted_fms_size += lr.size
                            break

        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(self.sg.schedule)

        # Run schedule buffering - with weight buffer size reduction
        schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
        schedule.cascades = self.sg.schedule.cascades
        self.sg.schedule = schedule

        # Apply the new buffer schedule and calculate new performance
        self.update_op_memory_snapshot(self.sg.schedule)
        self.apply_schedule(self.sg.schedule)
        self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        improvement_tot = (
            round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
        )
        improvement_dram = (
            round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
        )

        # Compare both total and dram improvement
        if not (improvement_tot >= 0 and improvement_dram > 0):
            # No improvement, restore the default schedule
            for sched_op in self.sched_ops:
                sched_op.evicted_fms_size = 0

            for tens in self.scratched_fms:
                tens.mem_area = self.scratched_fms[tens][0]
                tens.mem_type = self.scratched_fms[tens][1]

            self.sg.schedule = default_schedule
            self.update_op_memory_snapshot(self.sg.schedule)
            self.apply_schedule(self.sg.schedule)
            self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

    def apply_schedule(self, sched: Schedule):
        """Applies the given schedule as a final solution"""
        for sched_op in self.sched_ops:
            op_info = sched.cost_map[sched_op]
            cascade_info = sched.cascades.get(op_info.cascade, None)
            if cascade_info and sched_op in cascade_info.buffers:
                buffer_tens = sched_op.ifm.connection.parent_tens
                # Apply memory area and type
                buffer_tens.mem_area = self.arch.fast_storage_mem_area
                buffer_tens.mem_type = MemType.Scratch_fast
                # Apply Rolling buffer
                buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
                buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)

            sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()

            # Ensure that the src_tensor reference is set correctly
            for tens in op_info.buffered_weight_tensors:
                tens.src_tensor = op_info.npu_weights_tensor

    def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
        max_mem_usage = []
        base_mem_usage = []
        fast_storage_type = MemType.Scratch_fast
        fast_storage_mem_area = self.arch.fast_storage_mem_area
        self.evicted_fms = []

        # Force all OFMs to fast-storage
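        # (optimistic placement - anything that cannot fit under the staging limit is evicted again below)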
1200 for sched_op in self.sched_ops:
1201 cost = schedule.cost_map[sched_op]
            if cost.cascade == 0 and sched_op.get_dependants():
                ofm_tens = sched_op.ofm.connection.parent_tens
                if not any(cons is None for cons in ofm_tens.consumer_list):
                    if ofm_tens not in self.scratched_fms:
                        # Remember default mem area and mem type, only done once
                        self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)

                    ofm_tens.mem_area = fast_storage_mem_area
                    ofm_tens.mem_type = fast_storage_type

        # Collect live ranges from tensors
        memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )
        max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)

        # If everything fits in fast storage we can proceed without further analysis
        if max(max_mem_usage) <= staging_limit:
            return

        # Build up the base memory usage by removing the
        # mem_usage of the lrs we previously moved to fast-storage
        base_mem_usage = np.array(max_mem_usage)
        curr_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if self.scratched_fms.get(tens):
                    curr_lrs.append(lr)
                    base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
                    break
        competing_lrs = []
        competing_tens_access = {}
        for lr in curr_lrs:
            base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
            # If the lr can never fit within the staging limit it is evicted immediately
            if base_usage + lr.size > staging_limit:
                self.evicted_fms.append(lr)
                FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
                continue
            # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
            # the staging limit cannot be exceeded as long as max_mem_usage stays below it.
            # Thus, the affected lrs can remain in fast-storage if the following holds
            if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
                FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
            else:
                competing_lrs.append(lr)
                for tens in lr.tensors:
                    competing_tens_access[tens] = 0

        sz = len(competing_lrs)
        # If sz is zero, all lrs and their tensors have been handled and we can return
        if sz == 0:
            return

        # Estimate element access for all tensors that are competing for a place in fast-storage.
        # This number is used when deciding which tensors stay in fast-storage
        for sched_op in self.sched_ops:
            cost = schedule.cost_map[sched_op]

            if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
                tens = sched_op.ifm.connection.parent_tens
                access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
                competing_tens_access[tens] += access.ifm_read[0]

            if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
                tens = sched_op.ifm2.connection.parent_tens
                access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
                competing_tens_access[tens] += access.ifm_read[1]

            if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
                tens = sched_op.ofm.connection.parent_tens
                access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
                competing_tens_access[tens] += access.ofm_write

        competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
        start = 0
        start_time = competing_lrs[0].start_time
        end_time = competing_lrs[0].end_time
        component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
        # Build up components and then allocate each separately
        for i, lr in enumerate(competing_lrs):
            if lr.start_time <= end_time and i - start < component_allocator.MAX_EXHAUSTIVE_LIFE_RANGE:
                start_time = min(start_time, lr.start_time)
                end_time = max(end_time, lr.end_time)
            else:
                component_allocator.allocate_component(
                    component_allocator,
                    competing_lrs[start:i],
                    max_mem_usage,
                    base_mem_usage,
                    staging_limit,
                    self.scratched_fms,
                    competing_tens_access,
                    self.evicted_fms,
                )
                start = i
                start_time = lr.start_time
                end_time = lr.end_time
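        # Allocate the final component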
        component_allocator.allocate_component(
            component_allocator,
            competing_lrs[start:sz],
            max_mem_usage,
            base_mem_usage,
            staging_limit,
            self.scratched_fms,
            competing_tens_access,
            self.evicted_fms,
        )

    def move_constant_data(self):
        """Determine if data can be moved from permanent storage to another memory area. A move
        will generate a DMA command in the high-level command stream"""
        for sched_op in self.sched_ops:
            parent_op = sched_op.parent_op
            is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
            max_ifm_shram_avail = (
                (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
                * self.arch.shram_bank_size
                // 2
            )

            for idx, tens in enumerate(parent_op.inputs):
                if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
                    # Tensor is in permanent storage
                    # There is only a point in moving the data when permanent storage differs from feature map storage
                    if (
                        tens.mem_area in self.arch.permanent_storage_mem_area
                        and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
                    ) or tens.purpose == TensorPurpose.LUT:
                        if tens.purpose == TensorPurpose.LUT or (
                            # For elementwise broadcast
                            tens.purpose == TensorPurpose.FeatureMap
                            and sched_op.op_type.is_binary_elementwise_op()
                            and tens.shape != []
                            and sched_op.ifm.shape != sched_op.ofm.shape
                            and parent_op.write_shape is None
                            and tens.storage_size() > max_ifm_shram_avail
                        ):
                            only_vector_product_consumers = all(
                                oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
                                for oper in tens.consumers()
                            )

                            if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
                                new_tens = tens.clone_into_fast_storage(self.arch)
                                if tens.purpose == TensorPurpose.LUT:
                                    new_tens.mem_area = MemArea.Shram

                                new_tens.consumer_list.append(parent_op)
                                parent_op.inputs[idx] = new_tens
                                # If the index is out of range, IFM and IFM2 are the same tensor
                                # and pass inputs don't have duplicates
                                if idx < len(sched_op.parent_ps.inputs):
                                    sched_op.parent_ps.inputs[idx] = new_tens

    def print_schedule(self, schedule: Schedule):
        print(f"Schedule: '{schedule.name}'")
        for sched_op in self.sched_ops:
            if sched_op not in schedule.cost_map:
                # Sub-schedule printing
                continue

            op_info = schedule.cost_map[sched_op]
            print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
            print(f"\t\tType: {sched_op.op_type}")
            print(f"\t\tKernel: {sched_op.kernel}")
            print(f"{op_info}")
            mem_usage = (
                schedule.memory_snapshot[op_info.time_index]
                if op_info.time_index < len(schedule.memory_snapshot)
                else 0
            )
            print(f"\t\tSRAM Used: {mem_usage} bytes")

        print("\tCascades:")
        for i, cascade in enumerate(schedule.cascades.values()):
            print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")


def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
    """
    Creates live ranges and runs tensor allocator for the current schedule
    (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
    and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
    """
    root_sg = nng.get_root_subgraph()

    alloc_list = []
    if arch.is_spilling_enabled():
        mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
        # Order is important
        alloc_list.append(mem_alloc_scratch_fast)
        alloc_list.append(mem_alloc_scratch)
    else:
        mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
        alloc_list.append(mem_alloc_scratch)

    for mem_area, mem_type_set in alloc_list:
        tensor_allocation.allocate_tensors(
            nng,
            root_sg,
            arch,
            mem_area,
            mem_type_set,
            tensor_allocator=options.tensor_allocator,
            verbose_allocation=options.verbose_allocation,
            cpu_tensor_alignment=options.cpu_tensor_alignment,
            hillclimb_max_iterations=options.hillclimb_max_iterations,
        )


class FastStorageComponentAllocator:
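    """Exhaustively searches for the combination of competing feature-map live ranges to keep
    in fast storage that gives the highest total element-access score within the staging limit"""

    # Upper bound on the number of live ranges in one component; the exhaustive search
    # worst case grows exponentially with the number of live ranges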
    MAX_EXHAUSTIVE_LIFE_RANGE = 20

    def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
        self.base_mem_usage = base_mem_usage
        self.max_mem_usage = list(max_mem_usage)
        self.staging_limit = staging_limit
        self.lrs = []
        self.evicted = []
        self.curr_evicted = []
        self.remaining_total_size = []
        self.best_score = 0
        self.competing_tens_access = {}

    def allocate_exhaustive(self, ix, score):
        # Favour tensors with highest element access (score)
        if ix >= len(self.lrs):
            if score > self.best_score:
                self.best_score = score
                self.evicted = self.curr_evicted.copy()
            return

        lr = self.lrs[ix]
        for t in range(lr.start_time, lr.end_time):
            assert self.base_mem_usage[t] <= self.max_mem_usage[t]
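        # can_fit: the lr fits on top of the base usage (this lr and all still-undecided lrs excluded)
        # always_fits: the max usage, which already includes this lr and the undecided lrs, stays within the limit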
        base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
        can_fit = base_usage + lr.size <= self.staging_limit
        always_fits = can_fit

        if can_fit:
            max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
            always_fits = max_usage <= self.staging_limit

        if can_fit or always_fits:
            self.curr_evicted[ix] = False
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
            tens = lr.tensors[0]
            # Tensor is being included - add tensor element access to the score
            self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[tens])
            self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)

        if not always_fits:
            self.curr_evicted[ix] = True
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
            self.allocate_exhaustive(ix + 1, score)
            self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)

    @staticmethod
    def update_mem_usage(mem_usage, lr, increase):
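        """Add the live range's size to (or subtract it from) the memory usage over its lifetime"""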
        for t in range(lr.start_time, lr.end_time + 1):
            mem_usage[t] += lr.size if increase else -lr.size
            assert mem_usage[t] >= 0
        return mem_usage

    @staticmethod
    def evict(lr, max_mem_usage, scratched_fms):
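        """Remove the live range from the fast-storage usage and restore its tensors
        to their default memory area and type"""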
        for t in range(lr.start_time, lr.end_time + 1):
            max_mem_usage[t] -= lr.size
        for tens in lr.tensors:
            if tens in scratched_fms:
                tens.mem_area = scratched_fms[tens][0]
                tens.mem_type = scratched_fms[tens][1]

    @staticmethod
    def keep(lr, base_mem_usage, staging_limit):
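        """Keep the live range in fast storage by adding its size to the base memory usage"""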
        for t in range(lr.start_time, lr.end_time + 1):
            base_mem_usage[t] += lr.size
            assert base_mem_usage[t] <= staging_limit

    def allocate_component(
        self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms, competing_tens_access, evicted_fms
    ):
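        """Search one component of overlapping live ranges and keep each lr in fast storage
        or evict it back to its default memory according to the best allocation found"""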
        sz = len(lrs)
        allocator.lrs = lrs
        allocator.evicted = [0] * len(lrs)
        allocator.curr_evicted = [0] * sz
        allocator.best_score = -1
        allocator.competing_tens_access = competing_tens_access
        # Recursively evaluate all combinations of keeping or evicting the lrs found in the component.
        # For every combination that fits within the staging_limit a score is calculated.
        # The combination with the highest score is then chosen. The score is calculated
        # as the sum of the actual element accesses (ifm read and ofm write) for all the
        # included tensors, so it is not necessarily the tensor with the biggest size that ends up
        # being included in the result.
        allocator.allocate_exhaustive(0, 0)

        # Optimal allocation has been found, move lrs accordingly
        for i, e in enumerate(allocator.evicted):
            if e:
                self.evict(lrs[i], max_mem, scratched_fms)
                if lrs[i] not in evicted_fms:
                    evicted_fms.append(lrs[i])
            else:
                self.keep(lrs[i], min_mem, staging_limit)
                if lrs[i] in evicted_fms:
                    evicted_fms.remove(lrs[i])


def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
    """Entry point for the Scheduler"""
    # Initialize CPU subgraphs
    schedulers = dict()
    # Initialize schedulers with max schedule. Only schedule NPU subgraphs
    for sg in nng.subgraphs:
        if sg.placement != PassPlacement.Npu:
            # Create cascaded passes for CPU Ops
            cascaded_passes = []
            for idx, ps in enumerate(sg.passes):
                cps = CascadedPass(
                    ps.name,
                    SchedulingStrategy.WeightStream,
                    ps.inputs,
                    [],
                    ps.outputs,
                    [ps],
                    ps.placement,
                    False,
                )

                cps.time = idx
                ps.cascade = cps
                cascaded_passes.append(cps)

            sg.cascaded_passes = cascaded_passes
        else:
            # Npu subgraph - create schedule
            scheduler = Scheduler(nng, sg, arch, scheduler_options)
            schedulers[sg] = scheduler

            scheduler.create_scheduler_representation(arch)
            sg.sched_ops = scheduler.sched_ops
            scheduler.move_constant_data()

            # Create the Max schedule template
            max_schedule_template = scheduler.create_initial_schedule()
            scheduler.max_schedule = max_schedule_template

            # Create the optimised Max schedule
            sg.schedule = max_schedule_template
            scheduler.update_op_memory_snapshot(max_schedule_template)
            opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
            sg.schedule = opt_max_schedule
            scheduler.update_op_memory_snapshot(opt_max_schedule)

            # Create Min schedule
            min_schedule = scheduler.propose_minimal_schedule()
            initial_sram_limit = scheduler_options.optimization_sram_limit
            if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
                initial_sram_limit = scheduler.min_memory_req

            # Build cascades for Min schedule
            scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
            sg.schedule = min_schedule
            scheduler.update_op_memory_snapshot(min_schedule)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
                # Create an optimized schedule
                sg.schedule = scheduler.optimize_schedule(
                    min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
                )
                scheduler.update_op_memory_snapshot(sg.schedule)

            scheduler.apply_schedule(sg.schedule)
            scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)

            if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
                # It might be possible to gain performance by reducing the
                # weight buffer size and instead fitting FMs in fast storage
                scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)

            if scheduler_options.verbose_schedule:
                scheduler.print_schedule(sg.schedule)

    # Evaluate schedule
    _update_tensor_allocation(nng, arch, options)