# SPDX-FileCopyrightText: Copyright 2020-2022 Arm Limited and/or its affiliates <open-source-office@arm.com>
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description:
# The scheduler creates and searches for an optimal plan for the network, selecting block configurations and
# subdivisions for the Operators
# Class name forward references for the type annotations (see PEP 563).
from __future__ import annotations

import copy
from collections import namedtuple
from enum import auto
from enum import IntEnum
from typing import Any
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import TYPE_CHECKING

# Import needed for Type annotations. Only import for Type checking to avoid run-time errors due to cyclic import.
if TYPE_CHECKING:
    from .npu_performance import CycleCost

import numpy as np

from . import live_range
from . import npu_performance
from . import tensor_allocation
from . import weight_compressor
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import find_block_config
from .architecture_allocator import get_ifm_area_required
from .architecture_allocator import to_upscale
from .architecture_allocator import is_nearest
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .cascade_builder import CascadeBuilder
from .cascade_builder import CascadeInfo
from .data_type import DataType
from .nn_graph import CascadedPass
from .nn_graph import Graph
from .nn_graph import Pass
from .nn_graph import PassPlacement
from .nn_graph import SchedulingStrategy
from .nn_graph import Subgraph
from .live_range import ofm_can_reuse_ifm
from .numeric_util import round_down
from .numeric_util import round_up
from .operation import NpuBlockType
from .operation import Op
from .shape4d import Shape4D
from .tensor import MemArea
from .tensor import MemType
from .tensor import Tensor
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
from .weight_compressor import NpuWeightTensor

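# NHCWB16 stores the depth axis in 16-element bricks, so the storage shape rounds depth up to the
# next multiple of 16: e.g. a feature map of depth 20 occupies 32 depth elements of storage.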
def shape_for_format(shape: Shape4D, tensor_format: TensorFormat) -> Shape4D:
    if tensor_format == TensorFormat.NHCWB16:
        return shape.with_depth(round_up(shape.depth, 16))

    return shape


class OptimizationStrategy(IntEnum):
    """Enum defining the different optimization strategies for the Scheduler"""

    Size = auto()
    Performance = auto()

    def __str__(self):
        return self.name


class SchedulerOpInfo:
    """Contains metadata about a SchedulerOperation that is unique to one Schedule"""

    def __init__(
        self,
        block_config: ArchitectureBlockConfig,
        weights_size: int,
        stripe_input: Shape4D,
        stripe_input2: Optional[Shape4D],
        stripe: Shape4D,
    ):
        self.block_config = block_config
        self.weights_size = weights_size
        self.stripe_input = stripe_input
        self.stripe_input2 = stripe_input2
        self.stripe = stripe
        self.cascade = 0  # Assigned by CascadeBuilder. 0 means not part of a cascade
        self.time_index = None  # Set by update_op_memory_snapshot
        self.ofm_depth_slices: List[int] = [0, stripe.depth]
        self.npu_weights_tensor: Optional[NpuWeightTensor] = None
        self.npu_scales_tensor: Optional[NpuWeightTensor] = None
        self.buffered_weight_tensors: List[Tensor] = []
        self.cycles: Optional[CycleCost] = None
        self.slack_buffering_cycles = 0
        self.slack_buffering_memory = 0
        self.full_weight_transfer_cycles = 0

    def copy(self):
        res = SchedulerOpInfo(
            self.block_config,
            self.weights_size,
            self.stripe_input,
            self.stripe_input2,
            self.stripe,
        )
        res.cascade = self.cascade
        return res

    def __str__(self):
        res = f"\t\tBlock Config = {self.block_config}\n"
        res += f"\t\tOFM Block = {self.block_config.ofm_block}\n"
        res += f"\t\tIFM Stripe = {self.stripe_input}\n"
        res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n"
        res += f"\t\tOFM Stripe = {self.stripe}\n"
        res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n"
        for idx, tens in enumerate(self.buffered_weight_tensors):
            res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n"
        res += f"\t\tDepth slices = {self.ofm_depth_slices}\n"
        res += f"\t\tAssigned Cascade = {self.cascade}"
        return res


class SchedulerOptions:
    """Contains options for the Scheduler"""

    def __init__(
        self,
        optimization_strategy,
        sram_target,
        verbose_schedule,
    ):
        self.optimization_strategy = optimization_strategy
        self.optimization_sram_limit = sram_target
        self.verbose_schedule = verbose_schedule

    def __str__(self) -> str:
        return f"{type(self).__name__}: {str(self.__dict__)}"

    __repr__ = __str__


class SchedulerTensor:
    def __init__(self, shape, dt, mem_area, _format):
        self.dtype = dt
        self.mem_area = mem_area
        self.shape = shape
        self.format = _format
        self.connection = None


class SchedulerOperation:
    """Scheduler internal representation of 'Operation'
    This class can be seen as a node within the Scheduler Graph representation
    """

    def __init__(self, ps: Pass, arch: ArchitectureFeatures, nng: Graph):
        self.arch = arch
        self.parent_ps = ps
        self.parent_op = ps.primary_op
        self.name = ps.primary_op.name
        self.op_type = ps.primary_op.type
        self.activation = ps.primary_op.activation
        self.kernel = ps.primary_op.kernel
        self.resampling_mode = ps.primary_op.ifm_resampling_mode
        self.reversed_operands = False
        self.uses_scalar = ps.primary_op.ifm2 is not None and (
            ps.primary_op.ifm.shape == [] or ps.primary_op.ifm2.shape == []
        )

        self.ifm_ublock = arch.ifm_ublock

        self.ifm = SchedulerTensor(
            ps.ifm_shapes[0],
            ps.ifm_tensor.dtype,
            ps.ifm_tensor.mem_area,
            ps.ifm_tensor.format,
        )

        self.ifm2 = None
        if ps.ifm2_tensor:
            self.ifm2 = SchedulerTensor(
                ps.ifm_shapes[1],
                ps.ifm2_tensor.dtype,
                ps.ifm2_tensor.mem_area,
                ps.ifm2_tensor.format,
            )

        self.ofm = SchedulerTensor(
            ps.ofm_shapes[0],
            ps.ofm_tensor.dtype,
            ps.ofm_tensor.mem_area,
            ps.ofm_tensor.format,
        )

        # Input volume width and height required to produce the smallest possible stripe
        self.min_stripe_input_w, self.min_stripe_input_h = self._calculate_min_stripe_input()

        # Flags that mark whether this SchedulerOperation requires full IFM/OFM
        self.requires_full_ifm = False
        self.requires_full_ifm2 = False
        self.requires_full_ofm = False

        self.evicted_fms_size = 0

        self.index = 0

        # Perform an IFM swap for certain binary elementwise operators
        # in order to enable cascading, if the SchedOp conforms to
        # Elementwise cascading rules.
        # The non-constant/non-scalar/non-broadcast IFM should be the primary input
        if self.op_type.is_binary_elementwise_op():
            ifm = self.parent_op.ifm
            ifm2 = self.parent_op.ifm2
            ofm = self.parent_op.ofm

            ifm_can_swap = ifm.is_const or ifm.is_scalar
            ifm2_can_be_primary = not (ifm2.is_const or ifm2.is_scalar or ifm2.is_broadcast(ofm))

            if ifm_can_swap and ifm2_can_be_primary:
                # IFM2 is the primary input
                self.reversed_operands = True
                self.ifm, self.ifm2 = self.ifm2, self.ifm

                self.parent_ps.ifm_shapes = self.parent_ps.ifm_shapes[::-1]
                self.parent_ps.inputs = self.parent_ps.inputs[::-1]
                self.parent_ps.ifm_tensor, self.parent_ps.ifm2_tensor = (
                    self.parent_ps.ifm2_tensor,
                    self.parent_ps.ifm_tensor,
                )

    def add_ifm_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        conn.consumers.append(self)
        self.ifm.connection = conn

    def add_ifm2_connection(self, conn: "Connection"):
        """Add input connection to another SchedulerOperation or Subgraph Input"""
        if self.ifm2:
            conn.consumers.append(self)
            self.ifm2.connection = conn
        else:
            assert False, f"Trying to set an IFM2 Connection to {self} which has no IFM2"

    def add_ofm_connection(self, conn: "Connection"):
        """Add output connection to another SchedulerOperation or Subgraph Output"""
        conn.producers.append(self)
        self.ofm.connection = conn

    def get_dependants(self):
        """Returns a list of the Ops that depend on this Operation's OFM"""
        return self.ofm.connection.consumers

    def ifm_size_in_bytes(self) -> int:
        """Returns size of the IFM in bytes"""
        ifm_storage_shape = shape_for_format(self.ifm.shape, self.ifm.format)
        return round_up(ifm_storage_shape.elements() * self.ifm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def ifm2_size_in_bytes(self) -> int:
        """Returns size of the IFM2 in bytes"""
        if self.ifm2:
            ifm2_storage_shape = shape_for_format(self.ifm2.shape, self.ifm2.format)
            return round_up(ifm2_storage_shape.elements() * self.ifm2.dtype.size_in_bytes(), Tensor.AllocationQuantum)

        return 0

    def ofm_size_in_bytes(self) -> int:
        """Returns size of the OFM in bytes"""
        ofm_storage_shape = shape_for_format(self.ofm.shape, self.ofm.format)
        return round_up(ofm_storage_shape.elements() * self.ofm.dtype.size_in_bytes(), Tensor.AllocationQuantum)

    def create_scheduler_info(self, nng: Graph, stripe: Shape4D) -> SchedulerOpInfo:
        """Returns schedule info about this SchedulerOperation based on how many ofm elements it should produce"""
        ifm_shape = self.ifm.shape
        ifm2_shape = self.ifm2.shape if self.ifm2 is not None else None
        ofm_shape = stripe

        if ofm_shape != self.ofm.shape:
            # Striped Op - Need to calculate stripe input volume
            stripe_input_w, stripe_input_h = self._get_stripe_input_requirement(stripe)
            # Ensure stripe input volume is within the full IFM volume
            stripe_input_h = min(stripe_input_h, self.ifm.shape.height)
            stripe_input_w = min(stripe_input_w, self.ifm.shape.width)
            ifm_shape = ifm_shape.with_hw(stripe_input_h, stripe_input_w)

            if self.ifm2:
                stripe_input2_h = min(stripe_input_h, self.ifm2.shape.height)
                stripe_input2_w = min(stripe_input_w, self.ifm2.shape.width)
                ifm2_shape = ifm2_shape.with_hw(stripe_input2_h, stripe_input2_w)

        block_config = self._get_block_config(ifm_shape, ifm2_shape, self.uses_scalar, ofm_shape)

        scheduler_op_info = SchedulerOpInfo(block_config, 0, ifm_shape, ifm2_shape, ofm_shape)
        if self.parent_op.weights:
            # Default full-depth weight encoding with no buffering
            (
                scheduler_op_info.npu_weights_tensor,
                scheduler_op_info.npu_scales_tensor,
            ) = weight_compressor.encode_weight_and_scale_tensor(
                self.arch,
                self.parent_op,
                self.parent_op.weights,
                self.parent_op.bias,
                self.kernel,
                block_config,
                [0, self.ofm.shape.depth],
            )

        self.parent_ps.block_config = block_config.old_style_representation()
        return scheduler_op_info

    def _get_stripe_input_requirement(self, stripe_shape: Shape4D) -> Tuple[int, int]:
        """Returns the amount of IFM required to produce the stripe with shape:'stripe_shape'"""
        ofm_shape_to_produce = Block.from_shape(stripe_shape.as_list())

        return get_ifm_area_required(ofm_shape_to_produce, self.kernel, self.resampling_mode)

    def _calculate_min_stripe_input(self) -> Tuple[int, int]:
        # Calculate the input volume required height and width for the smallest possible stripe (h,w = 1,1)
        min_stripe = self.ofm.shape.with_hw(1, 1)
        return self._get_stripe_input_requirement(min_stripe)

    def _get_block_config(
        self, ifm_shape: Shape4D, ifm2_shape: Optional[Shape4D], uses_scalar: bool, ofm_shape: Shape4D
    ) -> Optional[ArchitectureBlockConfig]:
        # Returns a block config and SHRAM layout
        lut_banks = 2 if self.parent_op.activation_lut else 0
        return find_block_config(
            self.arch,
            self.op_type.npu_block_type,
            ofm_shape,
            ifm_shape,
            ifm2_shape,
            uses_scalar,
            self.ifm.dtype.size_in_bits(),
            self.kernel,
            lut_banks,
            self.parent_op.has_scaling(),
            self.resampling_mode,
        )


class Connection:
    """Scheduler internal representation of a Tensor that connects two SchedulerOperations
    This class can be seen as an edge within the Scheduler Graph representation
    """

    def __init__(self, tensor: Tensor):
        self.parent_tens = tensor

        # SchedulerOperation relationships
        self.producers: List[SchedulerOperation] = []
        self.consumers: List[SchedulerOperation] = []

    def __str__(self):
        return f"<Connection {self.parent_tens.name}>"

    __repr__ = __str__


class Schedule:
    """Class that contains a solution of how to schedule an NPU subgraph and its cost"""

    def __init__(self, sg: Subgraph, label: str):
        self.sg = sg
        self.label = label
        self.cost_map: Dict[SchedulerOperation, SchedulerOpInfo] = {}
        self.cascades: Dict[int, CascadeInfo] = {}
        self.fast_storage_peak_usage = 0
        self.memory_snapshot: Optional[List[int]] = None

    @property
    def name(self):
        return f"{self.sg.name}_{self.label}"

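# Rough sketch of how these classes are typically driven (the module-level entry point is further
# down the file, so the exact sequence used there may differ):
#
#   scheduler = Scheduler(nng, sg, arch, scheduler_options)
#   scheduler.create_scheduler_representation(arch)
#   max_schedule = scheduler.create_initial_schedule()
#   scheduler.update_op_memory_snapshot(max_schedule)
#   min_schedule = scheduler.propose_minimal_schedule()
#   scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule, sram_limit)
#   schedule = scheduler.optimize_schedule(min_schedule, max_schedule, max_schedule, scheduler_options)
#   scheduler.apply_schedule(schedule)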
class Scheduler:
    """Main class of the Vela Scheduling"""

    def __init__(self, nng: Graph, sg: Subgraph, arch: ArchitectureFeatures, options: SchedulerOptions):
        self.nng = nng
        self.sg = sg
        self.arch = arch
        self.sched_ops: List[SchedulerOperation] = []
        self.max_schedule: Optional[Schedule] = None
        self.scheduler_options = options

        self.scratched_fms: Dict[Tensor, Any] = {}
        self.evicted_fms: List[live_range.LiveRange] = []

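    # For the Size strategy, an elementwise OFM whose depth is not a multiple of 16 is kept in
    # linear (NHWC) format whenever one of its NHWC inputs has a matching shape, format and dtype,
    # so that the output can potentially overwrite that input instead of wasting SRAM on padding.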
    def avoid_nhcwb16_for_ofm(self, tens, ps, arch):
        # Only run this check for opt strategy Size
        if self.scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
            return False

        op = ps.primary_op
        if not op.type.is_elementwise_op():
            return False

        depth = op.ofm_shapes[0][-1]
        if (depth % 16) == 0:
            return False

        # Check if overwriting the inputs can be allowed
        OpShapeTens = namedtuple("OpShapeTens", ["op_shape", "tens"])
        outp = OpShapeTens(op.ofm_shapes[0], op.ofm)
        inps = []
        if op.ifm is not None:
            inps.append(OpShapeTens(op.ifm_shapes[0], op.ifm))
        if op.ifm2 is not None:
            inps.append(OpShapeTens(op.ifm_shapes[1], op.ifm2))

        # Find an input tensor that can be overwritten by the output
        for inp in inps:
            if (
                # check op input and output shapes allow overlapping
                inp.op_shape == outp.op_shape
                # check input tensor is valid
                and inp.tens is not None
                and inp.tens.shape != []
                # check input and output tensors are compatible
                and inp.tens.format == outp.tens.format
                and inp.tens.dtype == outp.tens.dtype
            ):
                if inp.tens.format == TensorFormat.NHWC:
                    return True

        return False

    def create_scheduler_representation(self, arch: ArchitectureFeatures):
        """Creates a Scheduler Graph representation"""
        # Temporary dict for creating connections between the Operations
        connections: Dict[Tensor, Connection] = {}
        # Memory required for the largest FeatureMap that has to be full
        min_memory_req = 0
        for ps in self.sg.passes:
            if ps.primary_op:
                # Set tensor format to NHCWB16 for output FeatureMaps, if possible
                for output in ps.outputs:
                    if output in self.sg.output_tensors or output.purpose != TensorPurpose.FeatureMap:
                        continue

                    if output.needs_linear_format:
                        continue

                    if self.avoid_nhcwb16_for_ofm(output, ps, arch):
                        output.needs_linear_format = True
                        continue

                    output.set_format(TensorFormat.NHCWB16, arch)

                # Create SchedulerOperations
                op = SchedulerOperation(ps, arch, self.nng)
                op.index = len(self.sched_ops)

                # Make connections
                if ps.ifm_tensor not in connections:
                    connections[ps.ifm_tensor] = Connection(ps.ifm_tensor)
                if ps.ifm2_tensor and ps.ifm2_tensor not in connections:
                    connections[ps.ifm2_tensor] = Connection(ps.ifm2_tensor)
                if ps.ofm_tensor not in connections:
                    connections[ps.ofm_tensor] = Connection(ps.ofm_tensor)

                op.add_ifm_connection(connections[ps.ifm_tensor])
                if ps.ifm2_tensor:
                    op.add_ifm2_connection(connections[ps.ifm2_tensor])
                op.add_ofm_connection(connections[ps.ofm_tensor])

                # Set requirements on the ifm/ofm buffers
                self.sched_ops.append(op)
                if ps.ifm_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor in self.sg.input_tensors:
                    # This Op consumes a subgraph input
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor in self.sg.output_tensors:
                    # This Op produces a subgraph output
                    op.requires_full_ofm = True
                if ps.ifm_tensor.needs_linear_format:
                    op.requires_full_ifm = True
                if ps.ifm2_tensor and ps.ifm2_tensor.needs_linear_format:
                    op.requires_full_ifm2 = True
                if ps.ofm_tensor.needs_linear_format or ps.primary_op.memory_function == Op.ConcatSliceWrite:
                    op.requires_full_ofm = True
                if len(ps.primary_op.outputs) > 1 or len(ps.primary_op.outputs[0].consumer_list) > 1:
                    # Op has multiple outputs or consumers - requires full OFM
                    op.requires_full_ofm = True

                # Check memory requirements if this Op requires any full FeatureMaps
                op_memory_req = 0
                if op.requires_full_ifm:
                    op_memory_req += op.ifm_size_in_bytes()
                if op.requires_full_ifm2:
                    op_memory_req += op.ifm2_size_in_bytes()
                if op.requires_full_ofm:
                    op_memory_req += op.ofm_size_in_bytes()

                min_memory_req = max(op_memory_req, min_memory_req)

        # Theoretical minimum required memory - used to guide the cascade building
        self.min_memory_req = min_memory_req

    def create_initial_schedule(self) -> Schedule:
        """Creates an initial schedule with no cascading or buffering of any kind"""
        schedule = Schedule(self.sg, "MAX")
        for op in self.sched_ops:
            cost = op.create_scheduler_info(self.nng, op.ofm.shape)
            cost.cycles = self.estimate_op_performance(op, cost.block_config, op.ofm.shape.depth)
            schedule.cost_map[op] = cost

        return schedule

    def update_op_memory_snapshot(self, schedule: Schedule):
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]

        # Collect live ranges from tensors
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Populate time-array with memory used by live ranges
        temporal_usage = lr_graph.get_temporal_memory_usage(self.arch.fast_storage_mem_area)
        schedule.memory_snapshot = temporal_usage

        # Set the peak memory usage
        schedule.fast_storage_peak_usage = max(temporal_usage, default=0)

    def estimate_op_performance(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_cycle_cost(self.arch, op.op_type, op.activation and op.activation.op_type, query)

    def estimate_element_access(self, op: SchedulerOperation, block_config, ofm_depth):
        query = npu_performance.PerformanceQuery(op.op_type.npu_block_type)
        query.ifm_shape = op.ifm.shape
        query.ifm_memory_area = op.ifm.mem_area
        query.ifm_bits = op.ifm.dtype.size_in_bits()
        query.ifm_format = op.ifm.format
        query.ifm2_shape = op.ifm2 and op.ifm2.shape
        query.ifm2_memory_area = op.ifm2 and op.ifm2.mem_area
        query.ifm2_bits = op.ifm2 and op.ifm2.dtype.size_in_bits()
        query.ifm2_format = op.ifm2 and op.ifm2.format
        query.ofm_shape = op.ofm.shape.with_depth(ofm_depth)
        query.ofm_memory_area = op.ofm.mem_area
        query.ofm_bits = op.ofm.dtype.size_in_bits()
        query.ofm_format = op.ofm.format
        if op.parent_op.bias:
            query.const_shape = Shape4D(1, 1, 1, op.ofm.shape.depth)
            query.const_memory_area = self.arch.fast_storage_mem_area

        query.kernel = op.kernel
        query.config = block_config

        return npu_performance.measure_element_access(self.arch, query)

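    # Buffering proposals walk the operators in execution order: the spare cycles and spare SRAM
    # (relative to the staging limit) recorded for one operator become the slack under which the
    # following operator's weights can be prefetched.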
    def propose_schedule_buffering(self, ref_schedule: Schedule, staging_limit_bytes):
        """Create a buffered schedule"""
        buffered_schedule = Schedule(self.sg, f"{ref_schedule.label}_BUFFERED")

        prev_op = None
        for sched_op in self.sched_ops:
            if sched_op not in ref_schedule.cost_map:
                # sched_op is not part of this sub-schedule - skip
                continue

            self.propose_operator_buffering(sched_op, prev_op, buffered_schedule, ref_schedule, staging_limit_bytes)
            prev_op = sched_op

        return buffered_schedule

    def propose_operator_buffering(
        self,
        sched_op: SchedulerOperation,
        prev_op: Optional[SchedulerOperation],
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        staging_limit_bytes,
    ):
        # Mild recursion might mean this Op has already been seen
        if sched_op in buffered_schedule.cost_map:
            return

        # Take the reference schedule as default costings for this schedule
        ref_cost = ref_schedule.cost_map[sched_op]
        cost = copy.copy(ref_cost)
        cost.slack_buffering_cycles = ref_cost.cycles.op_cycles
        memory_snapshot = ref_schedule.memory_snapshot
        ref_memory_usage = memory_snapshot[ref_cost.time_index] if ref_cost.time_index < len(memory_snapshot) else 0
        cost.slack_buffering_memory = staging_limit_bytes - ref_memory_usage
        buffered_schedule.cost_map[sched_op] = cost

        # Attempt weight buffering on anything with a weights tensor
        if sched_op.parent_op.weights:
            buffer_limit_bytes = cost.slack_buffering_memory

            # If applicable, apply a size limitation, but keep it within reason (ratio 1.5).
            # The size limitation is used when use_fast_storage_for_feature_maps has
            # detected that there are fms that do not fit in fast storage.
            if sched_op.evicted_fms_size and ((buffer_limit_bytes / sched_op.evicted_fms_size) >= 1.5):
                buffer_limit_bytes -= sched_op.evicted_fms_size

            self.propose_weight_buffering(
                sched_op.parent_op.weights,
                sched_op.parent_op.bias,
                sched_op,
                prev_op,
                buffered_schedule,
                ref_schedule,
                buffer_limit_bytes,
            )

        return cost

    def weights_needs_dma(self, weight_tensor):
        if weight_tensor and weight_tensor.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
            # Weights are in permanent storage
            # Only when permanent storage differs from feature map storage is there a point in moving the data
            if (
                weight_tensor.mem_area in (MemArea.Dram, MemArea.OffChipFlash)
                and self.arch.permanent_storage_mem_area != self.arch.fast_storage_mem_area
            ):
                return True
        return False

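    # Weight prefetching in short: weights are first encoded for the full OFM depth, and if the DMA
    # for that encoding does not fit into the previous operator's slack, the depth is split into
    # slices that can be double buffered, shrinking the slice depth until the buffers fit the limit.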
    def propose_weight_buffering(
        self,
        weight_tensor,
        scale_tensor,
        sched_op: SchedulerOperation,
        prev_op: SchedulerOperation,
        buffered_schedule: Schedule,
        ref_schedule: Schedule,
        buffer_limit_bytes,
    ):
        cost = buffered_schedule.cost_map[sched_op]
        prev_cost = buffered_schedule.cost_map.get(prev_op)
        ref_cost = ref_schedule.cost_map[sched_op]
        assert cost and ref_cost

        needs_dma = self.weights_needs_dma(weight_tensor)

        ofm_full_depth_slices = [0, ref_cost.stripe.depth]

        # Encode weights for the full depth
        full_weights, full_scales = weight_compressor.encode_weight_and_scale_tensor(
            self.arch,
            sched_op.parent_op,
            weight_tensor,
            scale_tensor,
            sched_op.kernel,
            cost.block_config,
            ofm_full_depth_slices,
        )
        full_weights_bytes = len(full_weights.buffer)
        cost.ofm_depth_slices = ofm_full_depth_slices

        # No buffering required - take all the weights from permanent storage
        if sched_op.op_type == Op.FullyConnected or not needs_dma:
            cost.npu_weights_tensor = full_weights
            cost.npu_scales_tensor = full_scales
            return

        encoded_weights: Optional[NpuWeightTensor] = full_weights
        encoded_scales = full_scales

        # How many NPU cycles are available under the previously executing
        # operator and SRAM unused for performing buffered DMA transfers
        slack_cycles = prev_cost.slack_buffering_cycles if prev_cost else 0
        slack_memory = prev_cost.slack_buffering_memory if prev_cost else 0

        # Force full depth for cascaded Ops
        if ref_cost.cascade != 0:
            weight_tensor_purpose = TensorSubPurpose.Standard
            weight_buffer_size = full_weights_bytes
            # Update the memory snapshot to reflect the added size of the weights
            ref_schedule.memory_snapshot[ref_cost.time_index] += weight_buffer_size
        else:
            # Estimate the buffering cycle time for the full set of weights
            full_transfer_cycles = npu_performance.measure_mem2mem_cycles(
                self.arch, weight_tensor.mem_area, self.arch.fast_storage_mem_area, full_weights_bytes
            )
            cost.full_weight_transfer_cycles = full_transfer_cycles

            # Calculate the amount of prebuffering necessary (or what is possible with limited
            # double buffer size)
            half_buffer_limit = buffer_limit_bytes // 2
            if full_transfer_cycles > slack_cycles:
                prebuffer_ratio = slack_cycles / full_transfer_cycles
                prebuffer_bytes = min(prebuffer_ratio * full_weights_bytes, half_buffer_limit)
            else:
                prebuffer_bytes = min(full_weights_bytes, half_buffer_limit)

            prebuffer_ratio = prebuffer_bytes / full_weights_bytes

            # Have to split the weights if the initial buffering can't store
            # all of the compressed weights
            if prebuffer_bytes < full_weights_bytes:
                block_depth = cost.block_config.ofm_block.depth

                # Choose initial prebuffering depth (already buffer clamped)
                prebuffer_depth = ref_cost.stripe.depth * prebuffer_ratio
                prebuffer_depth = int(max(16, round_down(prebuffer_depth, ArchitectureFeatures.OFMSplitDepth)))

                # Calculate cycles executed during the prebuffer
                pre_op_cycles = self.estimate_op_performance(sched_op, cost.block_config, prebuffer_depth)
                buffering_depth = ref_cost.stripe.depth * (pre_op_cycles.op_cycles / full_transfer_cycles)

                # Choose initial buffering depth and clamp to the double buffering limit
                buffering_depth = round_up(buffering_depth, block_depth)
                buffering_bytes = (buffering_depth / ref_cost.stripe.depth) * full_weights_bytes
                if buffering_bytes > half_buffer_limit:
                    buffering_depth = (half_buffer_limit / full_weights_bytes) * ref_cost.stripe.depth

                while True:
                    # Attempt to buffer whole blocks
                    if buffering_depth > block_depth:
                        buffering_depth = round_down(buffering_depth, block_depth)
                    else:
                        buffering_depth = round_down(buffering_depth, ArchitectureFeatures.OFMSplitDepth)
                    buffering_depth = int(max(buffering_depth, ArchitectureFeatures.OFMSplitDepth))

                    # Create list of depth slices
                    depth_slices = [0]
                    if prebuffer_depth < ref_cost.stripe.depth:
                        depth_slices += list(range(prebuffer_depth, ref_cost.stripe.depth, buffering_depth))
                    depth_slices.append(ref_cost.stripe.depth)

                    # Encode weights based on depth slices
                    cost.ofm_depth_slices = depth_slices
                    encoded_weights, encoded_scales = weight_compressor.encode_weight_and_scale_tensor(
                        self.arch,
                        sched_op.parent_op,
                        weight_tensor,
                        scale_tensor,
                        sched_op.kernel,
                        cost.block_config,
                        cost.ofm_depth_slices,
                    )
                    assert encoded_weights is not None
                    # Chosen buffering might not fit at all, iterate until it does
                    # or until the minimum usable slice size is reached
                    if (
                        encoded_weights.double_buffer_size() <= buffer_limit_bytes
                        or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth
                    ):
                        break

                    if buffering_depth > prebuffer_depth:
                        buffering_depth = round_up(buffering_depth // 2, ArchitectureFeatures.OFMSplitDepth)
                    else:
                        prebuffer_depth = round_up(prebuffer_depth // 2, ArchitectureFeatures.OFMSplitDepth)

                # Calculate cycles required to run the last op for use as future slack
                tail_cycles = self.estimate_op_performance(
                    sched_op, cost.block_config, depth_slices[-1] - depth_slices[-2]
                )
                cost.slack_buffering_cycles = tail_cycles.op_cycles

        # Determine whether the weights need to be double buffered
        weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes())

        # Only buffer weights if there's still space left for the buffer
        if weight_buffer_size <= buffer_limit_bytes:
            assert weight_buffer_size % 16 == 0
            # Determine whether to double buffer or single buffer
            double_buffer_size = encoded_weights.double_buffer_size()
            if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)):
                weight_tensor_purpose = TensorSubPurpose.DoubleBuffer
            else:
                weight_tensor_purpose = TensorSubPurpose.Standard

            cost.buffered_weight_tensors = [
                self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[0],
                    weight_tensor.name + "_buffer",
                )
            ]
            if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer:
                buf2 = self.buffer_tensor(
                    encoded_weights,
                    weight_tensor_purpose,
                    encoded_weights.double_buffer_sizes[1],
                    weight_tensor.name + "_buffer2",
                )
                cost.buffered_weight_tensors.append(buf2)

            last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors)
            weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx]

            if ref_cost.cascade == 0:
                # Determine if the lifetime can be extended and pre-buffer the first weight buffer
                # under the previous operation
                cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory

            cost.slack_buffering_memory -= weight_buffer_size
        else:
            # Don't slice or buffer - use the whole depth from persistent storage
            cost.ofm_depth_slices = ofm_full_depth_slices
            encoded_weights = full_weights
            encoded_scales = full_scales

        cost.npu_weights_tensor = encoded_weights
        cost.npu_scales_tensor = encoded_scales

    def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor:
        buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name)
        buffered_weight_tensor.src_tensor = src_tensor
        buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area
        buffered_weight_tensor.mem_type = MemType.Scratch_fast
        buffered_weight_tensor.purpose = TensorPurpose.Weights
        buffered_weight_tensor.sub_purpose = sub_purpose
        return buffered_weight_tensor

    def propose_minimal_schedule(self) -> Schedule:
        """Proposes scheduling parameters where every operator is subdivided into the smallest stripe that satisfies
        the next operator's stride"""
        min_schedule = Schedule(self.sg, "MIN")
        cost_map = min_schedule.cost_map

        # Keep track of the previous Op - which consumes the current Op's OFM
        prev_op: Optional[SchedulerOperation] = None
        for sched_op in reversed(self.sched_ops):
            min_stripe_height = prev_op.kernel.stride.y if prev_op else 1
            min_stripe = sched_op.ofm.shape.with_height(min_stripe_height)

            cost = sched_op.create_scheduler_info(self.nng, min_stripe)
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            cost_map[sched_op] = cost

            prev_op = sched_op

        return min_schedule

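    # Striping is propagated backwards through the operator list: the chosen OFM stripe of the last
    # operator determines how many IFM rows it needs, and that requirement becomes the OFM stripe
    # of the operator before it, and so on up the chain.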
    def propose_schedule_striping(self, final_stripe: Shape4D, label: str, ref_schedule: Schedule) -> Schedule:
        """Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down"""
        ref_cost = ref_schedule.cost_map

        striped_schedule = Schedule(self.sg, label)
        stripe = final_stripe
        for sched_op in reversed(self.sched_ops):
            if sched_op not in ref_cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            # Create a cost entry with the new stripe
            cost = sched_op.create_scheduler_info(self.nng, stripe)

            weight_tensor = cost.npu_weights_tensor
            for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors):
                # If the weights are buffered in the reference schedule they should be in the new proposal
                cost.buffered_weight_tensors.append(
                    self.buffer_tensor(
                        weight_tensor,
                        buffered_tens.sub_purpose,
                        weight_tensor.double_buffer_sizes[idx],
                        buffered_tens.name,
                    )
                )

            # Estimate performance
            cost.cycles = self.estimate_op_performance(sched_op, cost.block_config, sched_op.ofm.shape.depth)
            striped_schedule.cost_map[sched_op] = cost

            # Calculate the preceding Op's stripe.

            # In certain cases where an upscaling Op is cascaded,
            # it may get instructed to produce an odd stripe height.
            # Thus we need to force it back to even heights.
            force_even_stripe_heights = False
            for op in self.sched_ops:
                # Check if the cascade has a Nearest Neighbor-op.
                # If that is the case, force the stripes to be even.
                if (
                    ref_cost.get(op, None)
                    and ref_cost.get(sched_op, None)
                    and ref_cost[op].cascade == ref_cost[sched_op].cascade
                    and is_nearest(op.resampling_mode)
                ):
                    force_even_stripe_heights = True
                    break
            upscaling_remainder = stripe.height % to_upscale(sched_op.resampling_mode)
            height = stripe.height + (stripe.height % 2 if force_even_stripe_heights else upscaling_remainder)
            stripe = sched_op.ifm.shape.with_height(height * sched_op.kernel.stride.y)

        return striped_schedule

    def estimate_schedule_memory_usage(self, schedule: Schedule, non_local_mem_usage: dict):
        """Estimates the memory usage of a schedule"""
        cost = schedule.cost_map
        cascades = schedule.cascades
        peak_mem_usage = 0
        for sched_op in self.sched_ops:
            if sched_op not in cost:
                # sched_op is not part of the sub-schedule - skip
                continue

            if cost[sched_op].cascade:
                # This Op is part of a cascade - use the cascade's memory usage
                cascade_info = cascades[cost[sched_op].cascade]
                # Non-local memory usage is already included in the cascade_info
                peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage)
            else:
                # This Op is not part of a cascade - calculate the memory usage
                op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors)

                op_mem_usage = (
                    sched_op.ifm_size_in_bytes()
                    + sched_op.ofm_size_in_bytes()
                    + op_weight_buffer
                    + non_local_mem_usage.get(sched_op, 0)
                )
                peak_mem_usage = max(op_mem_usage, peak_mem_usage)

        return peak_mem_usage

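    # "Non-local" memory usage is everything in the memory snapshot that is live at an operator's
    # time index but is not produced or consumed by the operator itself; the cascade builder treats
    # it as a fixed overhead that its cascades must fit alongside.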
    def build_cascades_for_min_schedule(self, min_schedule: Schedule, max_template: Schedule, memory_limit: int):
        # Update memory snapshot
        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(min_schedule)

        # Calculate residual memory for Min schedule
        non_local_mem_usage = {}
        for sched_op in self.sched_ops:
            time_index = min_schedule.cost_map[sched_op].time_index

            if self.arch.is_spilling_enabled():
                # For Dedicated SRAM only the intermediate buffers are in SRAM, hence op_mem_usage is 0
                op_mem_usage = 0
            else:
                # The Min schedule only has ifm and ofm in SRAM (no buffered weight tensors)
                # Only include IFM's that are in the scratch area
                ifm = sched_op.ifm.connection.parent_tens
                ifm_size = (
                    0 if ifm.mem_type not in (MemType.Scratch, MemType.Scratch_fast) else sched_op.ifm_size_in_bytes()
                )
                ofm_size = 0 if ofm_can_reuse_ifm(sched_op) else sched_op.ofm_size_in_bytes()
                op_mem_usage = ifm_size + ofm_size

            non_local_mem_usage[sched_op] = min_schedule.memory_snapshot[time_index] - op_mem_usage
            assert non_local_mem_usage[sched_op] >= 0

        # Create cascades for Min schedule
        cascade_builder = CascadeBuilder(self.sched_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)
        cascade_builder.build_cascades(min_schedule, max_template, memory_limit)

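    # The search below proposes progressively taller stripes for the final operator of the cascade
    # and keeps the latest proposal that still fits the memory limit without splitting the cascade
    # into more pieces than the first proposal produced.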
    def optimize_sub_schedule(
        self, cascade_info: CascadeInfo, ref_schedule: Schedule, max_template: Schedule, memory_limit: int
    ) -> Schedule:
        """Extracts the Ops covered by the given cascade and creates a sub-schedule. The sub-schedule is optimized by
        proposing weight buffering and then continuously proposing new stripe sizes"""
        ref_cost = ref_schedule.cost_map
        # Extract the ops that are part of this sub-schedule
        start = cascade_info.start
        end = cascade_info.end
        sub_schedule_ops = self.sched_ops[start : end + 1]
        # Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule
        sub_schedule = Schedule(self.sg, f"SUB_{start}_{end}")
        for sched_op in sub_schedule_ops:
            sub_schedule.cost_map[sched_op] = ref_cost[sched_op]

        sub_schedule.cascades[end] = cascade_info
        # Use the memory snapshot from the reference schedule
        sub_schedule.memory_snapshot = ref_schedule.memory_snapshot

        # Calculate memory usage that is live during the sub-schedule but not part of it
        time_for_cascade = ref_cost[sub_schedule_ops[0]].time_index
        mem_usage_parallel_to_sub_schedule = ref_schedule.memory_snapshot[time_for_cascade] - cascade_info.mem_usage
        # If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's
        # included in a cascade or not
        persistent_initial_ifm = (
            sub_schedule_ops[0].ifm_size_in_bytes() if len(sub_schedule_ops[0].ifm.connection.consumers) > 1 else 0
        )
        # Calculate non-local-mem-usage per Operator
        non_local_mem_usage = {}
        for idx, sched_op in enumerate(sub_schedule_ops):
            non_local_mem_usage[sched_op] = mem_usage_parallel_to_sub_schedule
            if idx != 0:
                non_local_mem_usage[sched_op] += persistent_initial_ifm

        cascade_builder = CascadeBuilder(sub_schedule_ops, self.arch.is_spilling_enabled(), non_local_mem_usage)

        # Start by adding buffering
        buffered_sub_schedule = self.propose_schedule_buffering(
            sub_schedule, self.scheduler_options.optimization_sram_limit
        )
        # Copy the cascades over from the unbuffered-schedule
        buffered_sub_schedule.cascades = sub_schedule.cascades

        # Generate the possible stripings for the final Op in the sub-schedule
        final_ofm_shape = sub_schedule_ops[-1].ofm.shape

        # Skip testing the min stripe used in the MIN schedule since that will be used
        # anyway if no new cascades are created below
        last_op = sub_schedule_ops[-1]
        min_stripe_h = sub_schedule.cost_map[last_op].stripe.height + 1

        possible_stripes = [
            final_ofm_shape.with_height(stripe_h) for stripe_h in range(min_stripe_h, final_ofm_shape.height // 2 + 1)
        ]
        # Propose different striping
        best_schedule = None
        max_nbr_of_cascades = 0
        for iteration, proposed_stripe in enumerate(possible_stripes):
            proposed_schedule = self.propose_schedule_striping(
                proposed_stripe, f"OPTIMIZED_{iteration}", buffered_sub_schedule
            )

            cascade_builder.build_cascades(proposed_schedule, max_template, memory_limit)

            # Check if proposal fits
            proposed_schedule_mem_usage = self.estimate_schedule_memory_usage(proposed_schedule, non_local_mem_usage)

            nbr_of_cascades = len(proposed_schedule.cascades)

            if iteration == 0:
                # First iteration - used as limit to prevent splitting up the cascades
                # Long cascades are better in order to reduce IFM/IFM dram bandwidth
                max_nbr_of_cascades = nbr_of_cascades

            if (proposed_schedule_mem_usage) <= memory_limit and nbr_of_cascades <= max_nbr_of_cascades:
                best_schedule = proposed_schedule

                if not proposed_schedule.cascades:
                    # No cascading required - early exit
                    break
            else:
                break

        return best_schedule

    def optimize_schedule(
        self,
        schedule: Schedule,
        max_sched: Schedule,
        max_template: Schedule,
        options: SchedulerOptions,
    ) -> Schedule:
        """Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule"""
        sram_limit = options.optimization_sram_limit
        if max_sched.fast_storage_peak_usage < sram_limit and not self.arch.is_spilling_enabled():
            # Maximum performance schedule fits within the SRAM target
            return max_sched

        # Iterate over a copy of the cascades since they may change during the loop
        for cascade_info in list(schedule.cascades.values()):
            # Optimize the sub-schedule in this cascade
            opt_sub_schedule = self.optimize_sub_schedule(cascade_info, schedule, max_template, sram_limit)
            if opt_sub_schedule:
                # Remove the existing cascade
                del schedule.cascades[cascade_info.end]
                # Update the sub-schedule Op and cascade costs to the full schedule
                schedule.cost_map.update(opt_sub_schedule.cost_map)
                schedule.cascades.update(opt_sub_schedule.cascades)

        # Update memory snapshot
        self.sg.schedule = schedule
        self.update_op_memory_snapshot(schedule)
        # Propose schedule buffering to the optimized schedule
        optimized_sched = self.propose_schedule_buffering(schedule, self.scheduler_options.optimization_sram_limit)
        # Copy the cascade's metadata from the unbuffered schedule
        optimized_sched.cascades = schedule.cascades
        return optimized_sched

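    # Weight-buffer size optimization is speculative: performance is measured for the current
    # schedule, a new buffered schedule is proposed with the evicted-feature-map size limits
    # applied, and the default schedule is restored unless DRAM cycles improve without making the
    # total cycle count worse.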
    def optimize_weight_buffering_size(
        self,
        min_schedule: Schedule,
        options: SchedulerOptions,
    ):
        default_schedule = self.sg.schedule
        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        default_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        default_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        # Restore mem/type for scratched_fms
        for tens in self.scratched_fms:
            tens.mem_area = self.scratched_fms[tens][0]
            tens.mem_type = self.scratched_fms[tens][1]

        self.update_op_memory_snapshot(self.sg.schedule)

        # Collect live ranges from tensors
        memories_list = [(self.arch.fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
        lr_graph = live_range.LiveRangeGraph()
        for mem_area, mem_type_set in memories_list:
            live_range.extract_live_ranges_from_cascaded_passes(
                self.nng.get_root_subgraph(),
                mem_area,
                mem_type_set,
                lr_graph,
                Tensor.AllocationQuantum,
            )

        # Find the relation between the sched_op and the buffering tensor
        weight_ops = {}
        for sched_op in self.sched_ops:
            cost = self.sg.schedule.cost_map[sched_op]
            for tens in cost.buffered_weight_tensors:
                weight_ops[tens] = sched_op

        # Filter out weight buffer live ranges
        weight_lrs = []
        for lr in lr_graph.lrs:
            for tens in lr.tensors:
                if weight_ops.get(tens):
                    weight_lrs.append(lr)
                    break

        # See if any evicted fm overlaps with a weight buffering op.
        # If this is the case add a size limitation to the buffering op
        for lr in self.evicted_fms:
            for weight_lr in weight_lrs:
                if lr.overlaps_ranges(weight_lr):
                    for tens in weight_lr.tensors:
                        sched_op = weight_ops.get(tens)
                        if sched_op:
                            # Add size reduction to the op
                            sched_op.evicted_fms_size += lr.size
                            break

        self.sg.schedule = min_schedule
        self.update_op_memory_snapshot(self.sg.schedule)

        # Run schedule buffering - with weight buffer size reduction
        schedule = self.propose_schedule_buffering(self.sg.schedule, options.optimization_sram_limit)
        schedule.cascades = self.sg.schedule.cascades
        self.sg.schedule = schedule

        # Apply new buffer schedule and calc new performance
        self.update_op_memory_snapshot(self.sg.schedule)
        self.apply_schedule(self.sg.schedule)
        self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

        npu_performance.calc_new_performance_for_network(self.nng, self.arch, None, False)
        new_tot_cycles = self.nng.cycles[npu_performance.PassCycles.Total]
        new_dram_cycles = self.nng.cycles[npu_performance.PassCycles.DramAccess]

        improvement_tot = (
            round((default_tot_cycles - new_tot_cycles) / default_tot_cycles, 2) if default_tot_cycles != 0 else 0
        )
        improvement_dram = (
            round((default_dram_cycles - new_dram_cycles) / default_dram_cycles, 2) if default_dram_cycles != 0 else 0
        )

        # Compare both total and dram improvement
        if not (improvement_tot >= 0 and improvement_dram > 0):
            # No improvement, restore the default schedule
            for sched_op in self.sched_ops:
                sched_op.evicted_fms_size = 0

            for tens in self.scratched_fms:
                tens.mem_area = self.scratched_fms[tens][0]
                tens.mem_type = self.scratched_fms[tens][1]

            self.sg.schedule = default_schedule
            self.update_op_memory_snapshot(self.sg.schedule)
            self.apply_schedule(self.sg.schedule)
            self.use_fast_storage_for_feature_maps(self.sg.schedule, options.optimization_sram_limit)

Tim Halld8339a72021-05-27 18:49:40 +01001206 def apply_schedule(self, sched: Schedule):
1207 """Applies the given schedule as a final solution"""
1208 for sched_op in self.sched_ops:
1209 op_info = sched.cost_map[sched_op]
1210 cascade_info = sched.cascades.get(op_info.cascade, None)
1211 if cascade_info and sched_op in cascade_info.buffers:
1212 buffer_tens = sched_op.ifm.connection.parent_tens
1213 # Apply memory area and type
1214 buffer_tens.mem_area = self.arch.fast_storage_mem_area
1215 buffer_tens.mem_type = MemType.Scratch_fast
1216 # Apply Rolling buffer
1217 buffer_tens.set_format(TensorFormat.NHCWB16, self.arch)
1218 buffer_tens.set_new_sub_purpose(TensorSubPurpose.RollingBufferY, cascade_info.buffers[sched_op].height)
1219
1220 sched_op.parent_ps.block_config = op_info.block_config.old_style_representation()
1221
1222 # Ensure that the src_tensor reference is set correctly
Rickard Bolinfd8b5002022-05-16 09:11:06 +00001223 for tens in op_info.buffered_weight_tensors:
1224 tens.src_tensor = op_info.npu_weights_tensor
Tim Halld8339a72021-05-27 18:49:40 +01001225
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001226 def use_fast_storage_for_feature_maps(self, schedule, staging_limit):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001227 max_mem_usage = []
1228 base_mem_usage = []
1229 fast_storage_type = MemType.Scratch_fast
1230 fast_storage_mem_area = self.arch.fast_storage_mem_area
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001231 self.evicted_fms = []
Tim Halld8339a72021-05-27 18:49:40 +01001232
1233 # Force all OFMs to fast-storage
1234 for sched_op in self.sched_ops:
1235 cost = schedule.cost_map[sched_op]
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001236 if cost.cascade == 0 and sched_op.get_dependants():
1237 ofm_tens = sched_op.ofm.connection.parent_tens
1238 if not any(cons is None for cons in ofm_tens.consumer_list):
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001239 if ofm_tens not in self.scratched_fms:
1240 # Remember default mem area and mem type, only done once
1241 self.scratched_fms[ofm_tens] = (ofm_tens.mem_area, ofm_tens.mem_type)
1242
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001243 ofm_tens.mem_area = fast_storage_mem_area
1244 ofm_tens.mem_type = fast_storage_type
Tim Halld8339a72021-05-27 18:49:40 +01001245
1246 # Collect live ranges from tensors
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001247 memories_list = [(fast_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))]
Tim Halld8339a72021-05-27 18:49:40 +01001248 lr_graph = live_range.LiveRangeGraph()
1249 for mem_area, mem_type_set in memories_list:
1250 live_range.extract_live_ranges_from_cascaded_passes(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001251 self.nng.get_root_subgraph(),
1252 mem_area,
1253 mem_type_set,
1254 lr_graph,
1255 Tensor.AllocationQuantum,
Tim Halld8339a72021-05-27 18:49:40 +01001256 )
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001257 max_mem_usage = lr_graph.get_temporal_memory_usage(fast_storage_mem_area)
Tim Halld8339a72021-05-27 18:49:40 +01001258
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001259 # If everything fits in fast storage there is nothing more to do
1260 if max(max_mem_usage) <= staging_limit:
1261 return
1262
1263 # Build up the base memory usage by removing the
1264 # mem_usage of the lrs we previously moved to fast-storage
1265 base_mem_usage = np.array(max_mem_usage)
1266 curr_lrs = []
Tim Halld8339a72021-05-27 18:49:40 +01001267 for lr in lr_graph.lrs:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001268 for tens in lr.tensors:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001269 if self.scratched_fms.get(tens):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001270 curr_lrs.append(lr)
1271 base_mem_usage[lr.start_time : lr.end_time + 1] -= lr.size
1272 break
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001273 competing_lrs = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001274 competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001275 for lr in curr_lrs:
1276 base_usage = max(base_mem_usage[lr.start_time : lr.end_time + 1])
1277 # If true, the lr will never fit and may thus be evicted
1278 if base_usage + lr.size > staging_limit:
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001279 self.evicted_fms.append(lr)
1280 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001281 continue
1282 # Since max_mem_usage is the memory usage with all FMs still in fast-storage,
1283 # the staging limit cannot be exceeded as long as max_mem_usage stays within it.
1284 # Thus, the affected lrs can remain in fast-storage if the following is true
1285 if max(max_mem_usage[lr.start_time : lr.end_time + 1]) <= staging_limit:
1286 FastStorageComponentAllocator.keep(lr, base_mem_usage, staging_limit)
1287 else:
1288 competing_lrs.append(lr)
Johan Alfvén5c309712022-06-10 15:40:58 +02001289 for tens in lr.tensors:
1290 competing_tens_access[tens] = 0
1291
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001292 competing_lrs_sz = len(competing_lrs)
1293 # If competing_lrs_sz is zero, all lrs and their tensors have been handled and we can return
1294 if competing_lrs_sz == 0:
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001295 return
1296
Johan Alfvén5c309712022-06-10 15:40:58 +02001297 # Estimate element access for all tensors that are competing for a place in fast-storage.
1298 # This number is used when deciding which tensors stay in fast-storage
1299 for sched_op in self.sched_ops:
1300 cost = schedule.cost_map[sched_op]
1301
1302 if competing_tens_access.get(sched_op.ifm.connection.parent_tens) is not None:
1303 tens = sched_op.ifm.connection.parent_tens
1304 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1305 competing_tens_access[tens] += access.ifm_read[0]
1306
1307 if sched_op.ifm2 and competing_tens_access.get(sched_op.ifm2.connection.parent_tens) is not None:
1308 tens = sched_op.ifm2.connection.parent_tens
1309 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1310 competing_tens_access[tens] += access.ifm_read[1]
1311
1312 if competing_tens_access.get(sched_op.ofm.connection.parent_tens) is not None:
1313 tens = sched_op.ofm.connection.parent_tens
1314 access = self.estimate_element_access(sched_op, cost.block_config, sched_op.ofm.shape.depth)
1315 competing_tens_access[tens] += access.ofm_write
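# At this point competing_tens_access maps each competing tensor to its estimated number of
# element accesses, e.g. (hypothetical values) {ifm_tens: 120_000, ofm_tens: 48_000}; a higher
# value makes it more attractive to keep that tensor in fast storage during allocation below.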
1316
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001317 competing_lrs = sorted(competing_lrs, key=lambda lr: (lr.start_time, lr.end_time + 1, lr.size))
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001318
1319 # Remove lrs whose live range is too long compared to the others.
1320 # They cause problems for the HillClimb Allocator when it has to
1321 # change the allocation indices in order to fit all the allocations into SRAM.
1322 # This problem only occurs in larger networks with complex graphs.
1323 #
1324 # Limit the number of items that allocate_component works with to at most MAX_EXHAUSTIVE_ITEMS
1325 # at a time. Too many items would make the compilation time too long.
1326 #
1327 # "Too long" is currently decided as follows (based on experience from analyzing many networks):
1328 # compare the lr at position i with the lr at position i + MAX_EXHAUSTIVE_ITEMS.
1329 # If the end times differ by at least MAX_EXHAUSTIVE_LIFE_RANGE, do not include the lr at position i.
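# Example using the constants defined on FastStorageComponentAllocator (both currently 20) and
# hypothetical end times: an lr at position i whose live range spans at least 20 time steps and
# ends at time 150 is removed if the lr at position i + 20 ends at time 120, since 150 > 120 + 20.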
1330 if competing_lrs_sz > FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS:
1331 # create a copy of the original list to iterate over because the original version is modified in-loop
1332 competing_lrs_copy = competing_lrs.copy()
1333 for i, lr in enumerate(competing_lrs_copy):
1334 lr_time = lr.end_time - lr.start_time
1335 if lr_time < FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE:
1336 # Skip small ranges
1337 continue
1338
1339 # Compare the current lr with the lr at position i + MAX_EXHAUSTIVE_ITEMS
1340 cmp_pos = min(i + FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS, competing_lrs_sz - 1)
1341
1342 # Compare end times plus a margin of MAX_EXHAUSTIVE_LIFE_RANGE
1343 if (
1344 lr.end_time
1345 > competing_lrs_copy[cmp_pos].end_time + FastStorageComponentAllocator.MAX_EXHAUSTIVE_LIFE_RANGE
1346 ):
1347 # The current lr's live range stands out, so remove it. There is no point adding it to the
1348 # evicted_fms list since the lr should not be included in the fast storage allocation
1349 FastStorageComponentAllocator.evict(lr, max_mem_usage, self.scratched_fms)
1350 competing_lrs.remove(lr)
1351
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001352 start = 0
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001353 end_time = competing_lrs[0].end_time
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001354 competing_lrs_sz = len(competing_lrs)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001355 component_allocator = FastStorageComponentAllocator(base_mem_usage, max_mem_usage, staging_limit)
1356 # Build up components and then allocate each separately
1357 for i, lr in enumerate(competing_lrs):
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001358 nbr_items = i - start
1359 if lr.start_time <= end_time and (nbr_items < FastStorageComponentAllocator.MAX_EXHAUSTIVE_ITEMS):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001360 end_time = max(end_time, lr.end_time)
1361 else:
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001362 # The number of items has reached the maximum, or the current lr's start time
1363 # does not overlap with the previous lr's end time
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001364 component_allocator.allocate_component(
1365 component_allocator,
1366 competing_lrs[start:i],
1367 max_mem_usage,
1368 base_mem_usage,
1369 staging_limit,
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001370 self.scratched_fms,
Johan Alfvén5c309712022-06-10 15:40:58 +02001371 competing_tens_access,
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001372 self.evicted_fms,
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001373 )
1374 start = i
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001375 end_time = lr.end_time
1376 component_allocator.allocate_component(
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001377 component_allocator,
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001378 competing_lrs[start:competing_lrs_sz],
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001379 max_mem_usage,
1380 base_mem_usage,
1381 staging_limit,
1382 self.scratched_fms,
Johan Alfvén5c309712022-06-10 15:40:58 +02001383 competing_tens_access,
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001384 self.evicted_fms,
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001385 )
Tim Halld8339a72021-05-27 18:49:40 +01001386
1387 def move_constant_data(self):
1388 """Determine if data, can be moved from permanent storage to another memory area. A move
1389 will generate a DMA command in the high-level command stream"""
1390 for sched_op in self.sched_ops:
1391 parent_op = sched_op.parent_op
1392 is_lut_used = any(inp.purpose == TensorPurpose.LUT for inp in parent_op.inputs)
1393 max_ifm_shram_avail = (
1394 (self.arch.available_shram_banks(is_lut_used) - self.arch.shram_reserved_output_banks)
1395 * self.arch.shram_bank_size
1396 // 2
1397 )
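# Worked example with hypothetical architecture parameters: 48 available SHRAM banks, 2 reserved
# output banks and a bank size of 1024 bytes give
# max_ifm_shram_avail = (48 - 2) * 1024 // 2 = 23552 bytes.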
1398
1399 for idx, tens in enumerate(parent_op.inputs):
1400 if tens.mem_type not in (MemType.Scratch, MemType.Scratch_fast):
1401 # Tensor is in permanent storage
1402 # Only when permanent storage differs from feature map storage is there a point in moving the data
1403 if (
1404 tens.mem_area in self.arch.permanent_storage_mem_area
1405 and self.arch.permanent_storage_mem_area != self.arch.feature_map_storage_mem_area
1406 ) or tens.purpose == TensorPurpose.LUT:
1407 if tens.purpose == TensorPurpose.LUT or (
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001408 # For elementwise broadcast
Tim Halld8339a72021-05-27 18:49:40 +01001409 tens.purpose == TensorPurpose.FeatureMap
1410 and sched_op.op_type.is_binary_elementwise_op()
1411 and tens.shape != []
1412 and sched_op.ifm.shape != sched_op.ofm.shape
Patrik Gustavsson94292fe2021-09-02 08:22:58 +02001413 and parent_op.write_shape is None
Tim Halld8339a72021-05-27 18:49:40 +01001414 and tens.storage_size() > max_ifm_shram_avail
1415 ):
1416 only_vector_product_consumers = all(
1417 oper and oper.type.npu_block_type == NpuBlockType.VectorProduct
1418 for oper in tens.consumers()
1419 )
1420
1421 if (not only_vector_product_consumers) or tens.purpose == TensorPurpose.LUT:
1422 new_tens = tens.clone_into_fast_storage(self.arch)
1423 if tens.purpose == TensorPurpose.LUT:
1424 new_tens.mem_area = MemArea.Shram
1425
1426 new_tens.consumer_list.append(parent_op)
1427 parent_op.inputs[idx] = new_tens
Dwight Lidman352607c2021-09-29 17:00:09 +02001428 # If the index is out of range, IFM and IFM2 are the same tensor
1429 # and pass inputs don't have duplicates
1430 if idx < len(sched_op.parent_ps.inputs):
1431 sched_op.parent_ps.inputs[idx] = new_tens
Tim Halld8339a72021-05-27 18:49:40 +01001432
1433 def print_schedule(self, schedule: Schedule):
1434 print(f"Schedule: '{schedule.name}'")
1435 for sched_op in self.sched_ops:
1436 if sched_op not in schedule.cost_map:
1437 # Sub-schedule printing
1438 continue
1439
1440 op_info = schedule.cost_map[sched_op]
1441 print(f"\t{sched_op.index}: Operation {sched_op.name} - OFM {sched_op.ofm.shape}")
1442 print(f"\t\tType: {sched_op.op_type}")
1443 print(f"\t\tKernel: {sched_op.kernel}")
1444 print(f"{op_info}")
1445 mem_usage = (
1446 schedule.memory_snapshot[op_info.time_index]
1447 if op_info.time_index < len(schedule.memory_snapshot)
1448 else 0
1449 )
1450 print(f"\t\tSRAM Used: {mem_usage} bytes")
1451
Jonas Ohlsson25e700c2022-03-04 14:58:56 +01001452 print("\tCascades:")
Tim Halld8339a72021-05-27 18:49:40 +01001453 for i, cascade in enumerate(schedule.cascades.values()):
1454 print(f"\t\t{i}: {cascade.start} -> {cascade.end}, size: {cascade.mem_usage}")
Patrik Gustavssonfeeb06d2020-04-22 12:53:47 +02001455
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001456
Tim Halld8339a72021-05-27 18:49:40 +01001457def _update_tensor_allocation(nng: Graph, arch: ArchitectureFeatures, options):
1458 """
1459 Creates live ranges and runs tensor allocator for the current schedule
1460 (i.e. sg.schedule for all subgraphs), returns the maximum memory usage
1461 and updates SchedulerOpInfo.mem_usage for all operations in the schedule.
1462 """
1463 root_sg = nng.get_root_subgraph()
1464
1465 alloc_list = []
1466 if arch.is_spilling_enabled():
1467 mem_alloc_scratch_fast = (arch.fast_storage_mem_area, set((MemType.Scratch_fast,)))
1468 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch,)))
1469 # Order is important
1470 alloc_list.append(mem_alloc_scratch_fast)
1471 alloc_list.append(mem_alloc_scratch)
1472 else:
1473 mem_alloc_scratch = (arch.feature_map_storage_mem_area, set((MemType.Scratch, MemType.Scratch_fast)))
1474 alloc_list.append(mem_alloc_scratch)
1475
1476 for mem_area, mem_type_set in alloc_list:
1477 tensor_allocation.allocate_tensors(
1478 nng,
1479 root_sg,
1480 arch,
1481 mem_area,
1482 mem_type_set,
1483 tensor_allocator=options.tensor_allocator,
1484 verbose_allocation=options.verbose_allocation,
1485 cpu_tensor_alignment=options.cpu_tensor_alignment,
Tim Hallcda4fcb2022-05-19 12:36:58 +01001486 hillclimb_max_iterations=options.hillclimb_max_iterations,
Tim Halld8339a72021-05-27 18:49:40 +01001487 )
1488
1489
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001490class FastStorageComponentAllocator:
Johan Alfvén5c309712022-06-10 15:40:58 +02001491 MAX_EXHAUSTIVE_LIFE_RANGE = 20
Johan Alfvén3a6325f2022-10-07 18:03:48 +02001492 MAX_EXHAUSTIVE_ITEMS = 20
Johan Alfvén5c309712022-06-10 15:40:58 +02001493
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001494 def __init__(self, base_mem_usage, max_mem_usage, staging_limit):
1495 self.base_mem_usage = base_mem_usage
1496 self.max_mem_usage = list(max_mem_usage)
1497 self.staging_limit = staging_limit
1498 self.lrs = []
1499 self.evicted = []
1500 self.curr_evicted = []
1501 self.remaining_total_size = []
Johan Alfvén5c309712022-06-10 15:40:58 +02001502 self.best_score = 0
1503 self.competing_tens_access = {}
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001504
Johan Alfvén5c309712022-06-10 15:40:58 +02001505 def allocate_exhaustive(self, ix, score):
1506 # Favour tensors with highest element access (score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001507 if ix >= len(self.lrs):
Johan Alfvén5c309712022-06-10 15:40:58 +02001508 if score > self.best_score:
1509 self.best_score = score
Louis Verhaard5c8f1e52022-02-23 14:13:07 +01001510 self.evicted = self.curr_evicted.copy()
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001511 return
1512
1513 lr = self.lrs[ix]
1514 for t in range(lr.start_time, lr.end_time):
1515 assert self.base_mem_usage[t] <= self.max_mem_usage[t]
1516 base_usage = max(self.base_mem_usage[lr.start_time : lr.end_time + 1])
1517 can_fit = base_usage + lr.size <= self.staging_limit
1518 always_fits = can_fit
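# can_fit: the lr fits on top of the memory already committed to fast storage (base_mem_usage).
# always_fits: it also fits in the worst case where every still-undecided lr stays in fast
# storage (max_mem_usage); in that case evicting it never has to be considered.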
1519
1520 if can_fit:
1521 max_usage = max(self.max_mem_usage[lr.start_time : lr.end_time + 1])
1522 always_fits = max_usage <= self.staging_limit
1523
1524 if can_fit or always_fits:
1525 self.curr_evicted[ix] = False
1526 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, True)
Johan Alfvén5c309712022-06-10 15:40:58 +02001527 tens = lr.tensors[0]
1528 # Tensor is being included - add tensor element access to the score
1529 self.allocate_exhaustive(ix + 1, score + self.competing_tens_access[tens])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001530 self.base_mem_usage = self.update_mem_usage(self.base_mem_usage, lr, False)
1531
1532 if not always_fits:
1533 self.curr_evicted[ix] = True
1534 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, False)
Johan Alfvén5c309712022-06-10 15:40:58 +02001535 self.allocate_exhaustive(ix + 1, score)
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001536 self.max_mem_usage = self.update_mem_usage(self.max_mem_usage, lr, True)
1537
1538 @staticmethod
1539 def update_mem_usage(mem_usage, lr, increase):
1540 for t in range(lr.start_time, lr.end_time + 1):
1541 mem_usage[t] += lr.size if increase else -lr.size
1542 assert mem_usage[t] >= 0
1543 return mem_usage
1544
1545 @staticmethod
1546 def evict(lr, max_mem_usage, scratched_fms):
1547 for t in range(lr.start_time, lr.end_time + 1):
1548 max_mem_usage[t] -= lr.size
1549 for tens in lr.tensors:
1550 if tens in scratched_fms:
1551 tens.mem_area = scratched_fms[tens][0]
1552 tens.mem_type = scratched_fms[tens][1]
1553
1554 @staticmethod
1555 def keep(lr, base_mem_usage, staging_limit):
1556 for t in range(lr.start_time, lr.end_time + 1):
1557 base_mem_usage[t] += lr.size
1558 assert base_mem_usage[t] <= staging_limit
1559
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001560 def allocate_component(
1561 self, allocator, lrs, max_mem, min_mem, staging_limit, scratched_fms, competing_tens_access, evicted_fms
1562 ):
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001563 sz = len(lrs)
1564 allocator.lrs = lrs
1565 allocator.evicted = [0] * len(lrs)
1566 allocator.curr_evicted = [0] * sz
Johan Alfvén5c309712022-06-10 15:40:58 +02001567 allocator.best_score = -1
1568 allocator.competing_tens_access = competing_tens_access
1569 # Recursively evaluate all permutations of allocations of the lrs found in the component.
1570 # For every permutation that fits within the staging_limit there is a score calculated.
1571 # The permutation with the highest score will then be chosen. The score is calculated
1572 # as the sum of the actual element access (ifm read and ofm write) for all the
1573 # included tensors. So it is not necessarily the tensor with the biggest size that ends up
1574 # being included in the result.
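# Illustrative example (hypothetical sizes and accesses): with a staging_limit of 100 and two
# overlapping lrs A (size 60, element access 1000) and B (size 50, element access 400), only one
# of them fits at a time; the exhaustive search keeps A and evicts B because keeping A gives the
# higher score, even though evicting A would free more memory.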
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001575 allocator.allocate_exhaustive(0, 0)
1576
1577 # Optimal allocation has been found, move lrs accordingly
1578 for i, e in enumerate(allocator.evicted):
1579 if e:
1580 self.evict(lrs[i], max_mem, scratched_fms)
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001581 if lrs[i] not in evicted_fms:
1582 evicted_fms.append(lrs[i])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001583 else:
1584 self.keep(lrs[i], min_mem, staging_limit)
Johan Alfvén68b8f2f2022-06-24 08:42:19 +02001585 if lrs[i] in evicted_fms:
1586 evicted_fms.remove(lrs[i])
erik.andersson@arm.comde6cb642022-02-02 14:03:15 +01001587
1588
Tim Halld8339a72021-05-27 18:49:40 +01001589def schedule_passes(nng: Graph, arch: ArchitectureFeatures, options, scheduler_options: SchedulerOptions):
1590 """Entry point for the Scheduler"""
1591 # Initialize CPU subgraphs
1592 schedulers = dict()
1593 # Initialize schedulers with max schedule. Only schedule NPU subgraphs
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001594 for sg in nng.subgraphs:
Tim Halld8339a72021-05-27 18:49:40 +01001595 if sg.placement != PassPlacement.Npu:
1596 # Create cascaded passes for CPU Ops
1597 cascaded_passes = []
1598 for idx, ps in enumerate(sg.passes):
1599 cps = CascadedPass(
Jonas Ohlssond8575072022-03-30 10:30:25 +02001600 ps.name,
1601 SchedulingStrategy.WeightStream,
1602 ps.inputs,
1603 [],
1604 ps.outputs,
1605 [ps],
1606 ps.placement,
1607 False,
Tim Halld8339a72021-05-27 18:49:40 +01001608 )
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001609
Tim Halld8339a72021-05-27 18:49:40 +01001610 cps.time = idx
1611 ps.cascade = cps
1612 cascaded_passes.append(cps)
Andreas Nevalainen27d36f02020-11-19 11:27:50 +01001613
Tim Halld8339a72021-05-27 18:49:40 +01001614 sg.cascaded_passes = cascaded_passes
1615 else:
1616 # Npu subgraph - create schedule
1617 scheduler = Scheduler(nng, sg, arch, scheduler_options)
1618 schedulers[sg] = scheduler
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001619
Tim Halld8339a72021-05-27 18:49:40 +01001620 scheduler.create_scheduler_representation(arch)
1621 sg.sched_ops = scheduler.sched_ops
1622 scheduler.move_constant_data()
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001623
Tim Halld8339a72021-05-27 18:49:40 +01001624 # Create the Max schedule template
1625 max_schedule_template = scheduler.create_initial_schedule()
1626 scheduler.max_schedule = max_schedule_template
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001627
Tim Halld8339a72021-05-27 18:49:40 +01001628 # Create the optimised Max schedule
1629 sg.schedule = max_schedule_template
1630 scheduler.update_op_memory_snapshot(max_schedule_template)
Tim Hall789e6f32021-06-17 17:02:31 +01001631 opt_max_schedule = scheduler.propose_schedule_buffering(max_schedule_template, 1 << 32)
Tim Halld8339a72021-05-27 18:49:40 +01001632 sg.schedule = opt_max_schedule
1633 scheduler.update_op_memory_snapshot(opt_max_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001634
Tim Halld8339a72021-05-27 18:49:40 +01001635 # Create Min schedule
1636 min_schedule = scheduler.propose_minimal_schedule()
1637 initial_sram_limit = scheduler_options.optimization_sram_limit
1638 if scheduler_options.optimization_strategy == OptimizationStrategy.Size:
1639 initial_sram_limit = scheduler.min_memory_req
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001640
Johan Alfvén255dad72022-07-16 18:27:05 +02001641 # Build cascades for Min schedule
1642 scheduler.build_cascades_for_min_schedule(min_schedule, max_schedule_template, initial_sram_limit)
Tim Halld8339a72021-05-27 18:49:40 +01001643 sg.schedule = min_schedule
1644 scheduler.update_op_memory_snapshot(min_schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001645
Tim Halld8339a72021-05-27 18:49:40 +01001646 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance:
1647 # Create an optimized schedule
1648 sg.schedule = scheduler.optimize_schedule(
1649 min_schedule, opt_max_schedule, max_schedule_template, scheduler_options
1650 )
1651 scheduler.update_op_memory_snapshot(sg.schedule)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001652
Tim Halld8339a72021-05-27 18:49:40 +01001653 scheduler.apply_schedule(sg.schedule)
1654 scheduler.use_fast_storage_for_feature_maps(sg.schedule, scheduler_options.optimization_sram_limit)
Andreas Nevalainen897cc142020-10-28 15:42:08 +01001655
Johan Alfvén6f4cb032022-05-05 08:42:46 +02001656 if scheduler_options.optimization_strategy == OptimizationStrategy.Performance and scheduler.evicted_fms:
1657 # It might be possible to gain performance by reducing
1658 # weight buffer size and instead fit fms in fast storage
1659 scheduler.optimize_weight_buffering_size(min_schedule, scheduler_options)
1660
Tim Halld8339a72021-05-27 18:49:40 +01001661 if scheduler_options.verbose_schedule:
1662 scheduler.print_schedule(sg.schedule)
Tim Hall79d07d22020-04-27 18:20:16 +01001663
Tim Halld8339a72021-05-27 18:49:40 +01001664 # Evaluate schedule
1665 _update_tensor_allocation(nng, arch, options)