# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Description: Architecture SHRAM allocator
import enum
import math
from typing import Dict
from typing import Optional
from typing import Tuple
from typing import Union

from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import SHRAMConfig
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .numeric_util import round_up
from .numeric_util import round_up_divide
from .operation import Kernel
from .operation import NpuBlockType
from .range_set import MemoryRangeSet
from .shape4d import Shape4D
from .tensor import MemArea


class SHRAMLayout:
    def __init__(self):
        self.ib_start = 0
        self.ib_end = 0
        self.ib_start2 = 0
        self.ab_start = 0
        self.lut_start = 0


class ArchitectureBlockConfig:
    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0
        self.ifm_depth_buf_scaling = 0

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())


_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}


class ElementwiseUsage(enum.IntEnum):
    No = 0
    Full = 1
    Scalar = 2


def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
    ifm_depth_buf_scaling: int,
    cores: int,
) -> Union[SHRAMLayout, None]:
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Scale depth with cores
    ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
    ofm_depth = round_up_divide(ofm_block.depth, cores)

    # Always need IFM space
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)
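    # Note: the * 2 above provides for double buffering, so two IFM blocks can be
    # resident at once. E.g. (illustrative numbers only) a 1x8x8x32 IFM block at
    # 8 bits with 1024-byte banks needs 64 * 32 = 2048 bytes, i.e. 2 banks,
    # doubled to 4 and then rounded up to the IFM bank granule.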

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
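        # Accumulators are double buffered in the same way as the IFM. E.g.
        # (illustrative numbers only) an 8x8 OFM block with depth 16 and 32-bit
        # accumulators needs 64 * 16 * 32 / 8 = 4096 bytes: 4 banks of 1024
        # bytes, doubled to 8, then rounded up to the accumulator granule.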
        acc_start = acc_start - acc_banks
    else:
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout


def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    if ifm_shape.depth <= 8:
        return True

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
    part_utilisation = (
        ifm_shape.depth
        * kernel_elements
        / (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
    )
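    # E.g. (illustrative) an 8-bit 3x3 kernel with IFM depth 40: depth-kernel
    # utilisation is 40 / round_up(40, 32) = 0.625, while part-kernel is
    # 40 * 9 / (round_up(40, 8) * round_up(9, 4)) = 0.75, so part-kernel wins.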

    return part_utilisation > depth_utilisation


def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    ew_usage = ElementwiseUsage.No
    if npu_op_type == NpuBlockType.ElementWise:
        ew_usage = ElementwiseUsage.Full
        if uses_scalar:
            ew_usage = ElementwiseUsage.Scalar
    return ew_usage


def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    acc_type = SHRAMElements.Acc32
    if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
        acc_type = SHRAMElements.Acc40
    return acc_type


def is_nearest(ifm_resampling: resampling_mode) -> bool:
    return ifm_resampling == resampling_mode.NEAREST


def to_upscale(ifm_resampling: resampling_mode) -> int:
    # Upscaling depending on resampling mode
    return 1 if ifm_resampling == resampling_mode.NONE else 2


def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
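    # E.g. (illustrative) 8-bit part-kernel with an IFM depth of 64 gives
    # min(64, 16) = 16, rounded up to the IFM uBlock depth.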
    if ifm_bits == 16:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
    else:
        ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
    return ifm_blockdepth


def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
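    # Input extent needed to produce `value` outputs. E.g. 8 output rows at
    # stride 2 with a 3-high kernel and no upscaling need
    # ceil(((8 - 1) * 2 + 3 + 0) / 1) = 17 input rows.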
    return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))


def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
    w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
    return (w1, h1)


def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
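    # E.g. (illustrative) an 8x8 OFM block with a 3x3 kernel, stride 1 and no
    # upscaling needs a 10x10 IFM block before rounding to the uBlock grid.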
    # IFM block height
    h1 = _required_size(
        ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
    )
    h2 = h1
    height = round_up(min(h1, h2), ublock.height)

    # IFM block width
    w1 = _required_size(
        ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
    )
    w2 = w1
    width = round_up(min(w1, w2), ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)


def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a specific
    # interpretation of a more general constraint that can't be applied because the
    # find_block_config function must return block configs that can be applied to any OFM shape.
    if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
        return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
    return block


def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # IFM is not broadcast for pooling and depthwise ops, nor for elementwise ops
    # with no broadcasting in depth; in those cases the IFM depth buffering can be
    # scaled across the cores
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))
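    # E.g. (illustrative, assuming ofm_block_max is not the limiting factor) a
    # 1x7x7x24 OFM with a 2x2x8 OFM uBlock on one core gives a 1x8x8x24 search space.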

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                    ifm_depth_buf_scaling,
                    arch.ncores,
                )

                if layout:
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when the cost equals best_cost and the OFM block is small
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None


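# A minimal usage sketch for find_block_config (hypothetical values; assumes an
# ArchitectureFeatures instance `arch` configured for the target accelerator):
#
#   cfg = find_block_config(
#       arch,
#       NpuBlockType.ConvolutionMxN,
#       ofm_shape=Shape4D(1, 56, 56, 64),
#       ifm_shape=Shape4D(1, 56, 56, 32),
#       ifm2_shape=None,
#       uses_scalar=False,
#       ifm_bits=8,
#       kernel=Kernel(3, 3),
#       lut_banks=0,
#       scaled=True,
#       ifm_resampling=resampling_mode.NONE,
#   )
#   # cfg is None if no block configuration fits in SHRAM

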
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Union[Shape4D, Block],
    ifm_shape: Union[Shape4D, Block],
    ifm2_shape: Optional[Union[Shape4D, Block]],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.
    """
    # Check block config validity
    if not all(
        blk > 0 and blk <= blk_max and blk % ublk == 0
        for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    ):
        return None
    # Elementwise larger-volume correction
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel

    # IFM is not broadcast for pooling and depthwise ops, nor for elementwise ops
    # with no broadcasting in depth; in those cases the IFM depth buffering can be
    # scaled across the cores
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    layout = _try_block_config(
        arch.shram,
        ew_usage,
        block_config_opt,
        ifm_block,
        ifm_bits,
        ifm_granule,
        acc_bits,
        acc_granule,
        lut_banks,
        ifm_depth_buf_scaling,
        arch.ncores,
    )
    if layout is None:
        return None
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    config.ofm_block = block_config
    return config
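

# A minimal usage sketch for try_block_config (hypothetical values; assumes an
# ArchitectureFeatures instance `arch`, and the Block/Kernel constructor
# argument orders shown below):
#
#   cfg = try_block_config(
#       Block(16, 8, 32),  # candidate OFM block: width 16, height 8, depth 32
#       arch,
#       NpuBlockType.ConvolutionMxN,
#       ofm_shape=Shape4D(1, 56, 56, 64),
#       ifm_shape=Shape4D(1, 56, 56, 32),
#       ifm2_shape=None,
#       uses_scalar=False,
#       ifm_bits=8,
#       is_partkernel=False,
#       kernel=Kernel(3, 3),
#       lut_banks=0,
#       scaled=True,
#       ifm_resampling=resampling_mode.NONE,
#   )
#   # cfg is None if the candidate block is invalid or does not fit in SHRAM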