blob: 3c49eb13394a8cc0e4a87152f23d9063752cf7cd [file] [log] [blame]
Tim Halld8339a72021-05-27 18:49:40 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description: Architecture SHRAM allocator
18import enum
19import math
20from typing import Optional
21from typing import Tuple
22
23from .architecture_features import ArchitectureFeatures
24from .architecture_features import Block
25from .architecture_features import SHRAMConfig
26from .architecture_features import SHRAMElements
27from .ethos_u55_regs.ethos_u55_regs import resampling_mode
28from .numeric_util import round_up
29from .numeric_util import round_up_divide
30from .operation import Kernel
31from .operation import NpuBlockType
32from .range_set import MemoryRangeSet
33from .shape4d import Shape4D
34from .tensor import MemArea
35
36
class SHRAMLayout:
    """Start/end offsets (in SHRAM banks) of the IFM, accumulator and LUT regions."""

    def __init__(self):
        # All offsets default to bank 0; _try_block_config fills them in
        self.ib_start = self.ib_end = self.ib_start2 = 0
        self.ab_start = self.lut_start = 0
44
45
class ArchitectureBlockConfig:
    """A chosen IFM/OFM block pairing together with the SHRAM layout it occupies."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM memory range used by this shared buffer, excluding the LUT region."""
        end_byte = self.layout.lut_start * self.bank_size
        return MemoryRangeSet(MemArea.Shram, 0, end_byte)

    def old_style_representation(self):
        """Return [OFM height, OFM width, IFM depth, OFM depth] as used by the older block format."""
        ofm = self.ofm_block
        return [ofm.height, ofm.width, self.ifm_block.depth, ofm.depth]

    def __str__(self):
        return str(self.old_style_representation())
65
66
# Bit width of one accumulator element for each SHRAM accumulator type
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}
68
69
class ElementwiseUsage(enum.IntEnum):
    # How SHRAM must be portioned for elementwise inputs:
    #   No     - not an elementwise operation (accumulator space needed instead)
    #   Full   - elementwise with a full-sized second input buffer
    #   Scalar - elementwise where the second operand is a scalar (no IFM2 banks)
    No = 0
    Full = 1
    Scalar = 2
74
75
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Block,
    ifm_block: Block,
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
) -> Optional[SHRAMLayout]:
    """Attempt to fit the given IFM/OFM block combination into SHRAM.

    Returns the resulting SHRAMLayout, or None when the IFM (and, for
    elementwise ops, the IFM2) banks collide with the accumulator/LUT space.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Always need IFM space: double-buffered, rounded up to the IFM bank granule
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        # Double-buffered accumulators, rounded up to the accumulator granule
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: a second IFM buffer replaces the accumulators
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
125
126
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True if part-kernel-first traversal should be used (False selects depth-first)."""
    # Shallow IFMs always use part-kernel-first
    if ifm_shape.depth <= 8:
        return True

    # Compare estimated utilisation of both traversal orders and pick the better one
    kernel_elements = kernel.elements_wh()
    depth_round = 32 if ifm_bits == 8 else 16
    kernel_round = 4 if ifm_bits == 8 else 2
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, depth_round)
    part_utilisation = (ifm_shape.depth * kernel_elements) / (
        round_up(ifm_shape.depth, 8) * round_up(kernel_elements, kernel_round)
    )
    return part_utilisation > depth_utilisation
141
142
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify how SHRAM must be portioned for the operation's elementwise inputs."""
    if npu_op_type != NpuBlockType.ElementWise:
        return ElementwiseUsage.No
    # Scalar second operand needs no IFM2 buffering; otherwise a full buffer is required
    return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
150
151
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Return the SHRAM accumulator element type for this operation."""
    # Scaled 16-bit non-pooling operations need the wider 40-bit accumulator
    needs_acc40 = scaled and (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling
    return SHRAMElements.Acc40 if needs_acc40 else SHRAMElements.Acc32
158
159
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the IFM upscaling factor implied by the resampling mode (1 = no upscaling)."""
    if ifm_resampling == resampling_mode.NONE:
        return 1
    return 2
163
164
def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
    """Return the IFM block depth for the given IFM precision and traversal order."""
    if ifm_bits == 16:
        # 16-bit IFM: capped at 16 and rounded to a multiple of 4
        return round_up(min(ifm_shape.depth, 16), 4)
    # 8-bit IFM: part-kernel-first halves the depth cap; round to the IFM microblock
    depth_cap = 16 if is_partkernel else 32
    return round_up(min(ifm_shape.depth, depth_cap), arch.ifm_ublock.depth)
171
172
173def _required_size(value: int, stride: int, border: int, upscale: int) -> int:
174 return int(math.ceil(((value - 1) * stride + border) / upscale))
175
176
def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, upscale: int) -> Tuple[int, int]:
    """Return the (width, height) of IFM needed to compute the whole OFM shape."""
    width = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale)
    height = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale)
    return (width, height)
181
182
def _get_ifm_blocksize(
    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int
) -> Shape4D:
    """Return the IFM block shape required to feed one OFM block.

    Width and height derive from the sub-kernel-limited kernel footprint and
    stride, rounded up to whole microblocks; depth is copied from the OFM block.
    """
    # IFM block height (kernel height clamped to the sub-kernel limit)
    needed_h = _required_size(
        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale
    )
    height = round_up(needed_h, ublock.height)

    # IFM block width (kernel width clamped to the sub-kernel limit)
    needed_w = _required_size(
        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale
    )
    width = round_up(needed_w, ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)
197
198
Tim Hall30161572021-06-17 17:03:49 +0100199def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
200 # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) This is a specific
201 # interpretation of a more general constraint that can't be applied because the
202 # find_block_config function must return block configs that can be applied to any OFM shape.
203 if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
204 return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
205 return block
206
207
Tim Halld8339a72021-05-27 18:49:40 +0100208def find_block_config(
209 arch: ArchitectureFeatures,
210 npu_op_type: NpuBlockType,
211 ofm_shape: Shape4D,
212 ifm_shape: Shape4D,
213 ifm2_shape: Shape4D,
214 uses_scalar: bool,
215 ifm_bits: int,
216 kernel: Kernel,
217 lut_banks: int,
218 scaled: bool,
219 ifm_resampling: resampling_mode,
220) -> ArchitectureBlockConfig:
221 SplitDepth = ArchitectureFeatures.OFMSplitDepth
222 # Elementwise larger-volume correction
223 if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
224 ifm_shape = ifm2_shape
225
226 # Figure out if SHRAM should be portioned for elementwise
227 ew_usage = _ew_usage(npu_op_type, uses_scalar)
228
229 # Operator typing help
230 is_pooling = npu_op_type == NpuBlockType.Pooling
231 is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
232 is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
233 is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise
234
235 # Block config to be returned
236 config = ArchitectureBlockConfig()
237 config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)
238
239 # Accumulator & granule settings
240 config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
241
242 # Memory rounding granules
243 acc_granule = arch.accumulator_granules[config.acc_type]
244 acc_bits = _AccumulatorBits[config.acc_type]
245 if ew_usage != ElementwiseUsage.No:
246 ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
247 else:
248 ifm_granule = arch.ifm_bank_granules[ifm_bits]
249 lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
250 upscale = to_upscale(ifm_resampling)
251
252 # Subkernel repeats of the IFM
253 ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
254 kernel.area_height(), arch.SubKernelMax.height
255 )
256 ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)
257
258 # Weights fetch (for operators that have them)
259 weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0
260
261 search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
262 search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))
263
264 # Block WHC search, loops across the search space looking for best efficiency
265 best_cost = math.inf
Tim Halldaed1522021-07-19 21:22:46 +0100266 best_coverage = math.inf
Tim Halld8339a72021-05-27 18:49:40 +0100267 depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
268 if depth < ofm_shape.depth:
269 depth = round_up(depth, SplitDepth)
270
271 while depth <= search_space.depth:
272 wont_fit = {}
273 for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
274 for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
275 # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
276 # fit, then 4x8x16 won't either.
277 if wont_fit.get((height, width), False):
278 continue
279
280 # Calculate the IFM block dimensions required to feed this OFM block
281 ofm_block = Shape4D(1, height, width, depth)
282 ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
283 if not is_equal_depth_op:
284 ifm_block = ifm_block.with_depth(ifm_blockdepth)
285
286 # Test if the IFM/OFM blocks fit into SHRAM
Tim Hall30161572021-06-17 17:03:49 +0100287 ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
Tim Halld8339a72021-05-27 18:49:40 +0100288 layout = _try_block_config(
289 arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
290 )
291
292 if layout:
Tim Hall789e6f32021-06-17 17:02:31 +0100293 full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
294 blocks = ofm_shape / ofm_block
Tim Halld8339a72021-05-27 18:49:40 +0100295
Tim Hall789e6f32021-06-17 17:02:31 +0100296 # Weights fetching
297 weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
298 if not is_depthwise:
299 weight_fetch *= ofm_block.depth * blocks.depth
Tim Halld8339a72021-05-27 18:49:40 +0100300
Tim Hall789e6f32021-06-17 17:02:31 +0100301 # IFM fetching
302 ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
303 if not is_equal_depth_op:
304 ifm_fetch *= full_blocks.depth
Tim Halld8339a72021-05-27 18:49:40 +0100305
Tim Hall789e6f32021-06-17 17:02:31 +0100306 # Scale relative to every output OFM element
307 relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()
Tim Halld8339a72021-05-27 18:49:40 +0100308
309 # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
310 if ifm_shape.elements() < ifm_block.elements() * 2:
311 relative_cost = relative_cost / 2
312
Tim Halldaed1522021-07-19 21:22:46 +0100313 # Choose based on relative minimum cost or larger IFM area (if equal cost)
314 if relative_cost <= best_cost:
315 choose_this = False
316 # Check IFM coverage only when it's equal best_cost and small OFM
317 if relative_cost == best_cost:
318 coverage_shape = Shape4D.min(ifm_shape, ifm_block)
319 coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
320 # Small 4x4 IFM constraint found through analysis of networks
321 if coverage <= best_coverage and (height <= 4 and width <= 4):
322 best_coverage = coverage
323 choose_this = True
324 else:
325 best_coverage = math.inf
326 choose_this = True
327
328 if choose_this:
329 best_cost = relative_cost
330 config.layout = layout
331 config.bank_size = arch.shram_bank_size
332 config.ifm_block = ifm_block
333 config.ofm_block = Shape4D(1, height, width, depth)
Tim Halld8339a72021-05-27 18:49:40 +0100334 else:
335 wont_fit[(width, height)] = True
336
337 depth = depth + arch.ofm_ublock.depth
338 if depth < ofm_shape.depth:
339 depth = round_up(depth, SplitDepth)
340
341 if best_cost != math.inf:
342 return config
343
344 return None
345
346
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Block,
    ifm_shape: Block,
    ifm2_shape: Optional[Block],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.
    """
    # Reject blocks that are empty, exceed the maximum OFM block, or are not a
    # whole multiple of the OFM microblock in any dimension
    for blk, blk_max, ublk in zip(
        block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list()
    ):
        if blk <= 0 or blk > blk_max or blk % ublk != 0:
            return None

    # Elementwise larger-volume correction: use the larger input's shape
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = is_pooling or is_depthwise or (ew_usage != ElementwiseUsage.No)

    # Accumulator type and memory rounding granules
    acc_type = _acc_type(npu_op_type, ifm_bits, scaled)
    acc_granule = arch.accumulator_granules[acc_type]
    acc_bits = _AccumulatorBits[acc_type]
    if ew_usage == ElementwiseUsage.No:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]

    # LUT space never shrinks below the reserved end banks
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)

    # Derive the IFM block required to feed the requested OFM block
    upscale = to_upscale(ifm_resampling)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(_ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel))

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    layout = _try_block_config(
        arch.shram, ew_usage, block_config, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
    )
    if layout is None:
        return None

    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel
    config.acc_type = acc_type
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    return config