blob: 84d8354bdfe03bddf9bd65fe902173c86a928fc8 [file] [log] [blame]
Tim Halld8339a72021-05-27 18:49:40 +01001# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
2#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
16#
17# Description: Architecture SHRAM allocator
18import enum
19import math
20from typing import Optional
21from typing import Tuple
22
23from .architecture_features import ArchitectureFeatures
24from .architecture_features import Block
25from .architecture_features import SHRAMConfig
26from .architecture_features import SHRAMElements
27from .ethos_u55_regs.ethos_u55_regs import resampling_mode
28from .numeric_util import round_up
29from .numeric_util import round_up_divide
30from .operation import Kernel
31from .operation import NpuBlockType
32from .range_set import MemoryRangeSet
33from .shape4d import Shape4D
34from .tensor import MemArea
35
36
class SHRAMLayout:
    """Bank boundaries of the SHRAM regions used by one block config.

    All fields are bank indices, zeroed until filled in by
    _try_block_config: ib_start/ib_end bound the IFM buffer, ib_start2 is
    the second (elementwise) input buffer, ab_start the accumulator buffer
    and lut_start the LUT region.
    """

    def __init__(self):
        # Everything starts at bank 0 until a layout has been computed
        self.ib_start = self.ib_end = self.ib_start2 = 0
        self.ab_start = self.lut_start = 0
44
45
class ArchitectureBlockConfig:
    """A block configuration (IFM/OFM block shapes plus SHRAM layout)."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0
        self.ifm_depth_buf_scaling = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM range used by this shared buffer, excluding the LUT region."""
        return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

    def old_style_representation(self):
        """Return the legacy list form: [OFM height, OFM width, IFM depth, OFM depth]."""
        return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

    def __str__(self):
        return str(self.old_style_representation())
66
67
# Bit width of each SHRAM accumulator element type (used to size accumulator banks)
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}
69
70
class ElementwiseUsage(enum.IntEnum):
    """How SHRAM input-buffer space must be apportioned for an operation."""

    No = 0  # not elementwise: accumulator space is required instead of IFM2
    Full = 1  # elementwise with a full second input (IFM2 banks mirror IFM banks)
    Scalar = 2  # elementwise with a scalar second input (no IFM2 banks needed)
75
76
def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Block,
    ifm_block: Block,
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
    ifm_depth_buf_scaling: int,
    cores: int,
) -> Optional[SHRAMLayout]:
    """Attempt to lay the given IFM/OFM blocks out in SHRAM.

    Computes the bank counts needed for the IFM buffer, the optional second
    elementwise input and (for non-elementwise ops) the accumulators, then
    places them between the reserved output banks and the LUT region.

    Returns the resulting SHRAMLayout, or None if the blocks do not fit.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    # Scale depth with cores: each buffer only holds its share of the depth
    ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
    ofm_depth = round_up_divide(ofm_block.depth, cores)

    # Always need IFM space (the * 2 reserves two copies of the block;
    # presumably for double buffering - confirm against HW docs)
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) / 8, 8)
    ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
    ifm_banks = round_up(ifm_banks, ifm_granule)

    # Calculate SHRAM boundaries of the IFM and Accumulators
    lut_start = shram.total_banks - lut_banks
    ifm_end = shram.reserved_output_banks + ifm_banks
    ifm2_start = ifm_end
    acc_start = lut_start

    # If not elementwise then we need accumulator space
    if ew_usage == ElementwiseUsage.No:
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
        acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
        acc_banks = round_up(acc_banks, acc_granule)
        acc_start = acc_start - acc_banks
    else:
        # Elementwise: a Full second input mirrors the IFM bank count,
        # a Scalar second input needs no banks at all
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # IFM must still fit before accumulators
    if ifm_end > acc_start:
        return None

    # Should all fit, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = shram.reserved_output_banks
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout
132
133
def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True when part-kernel-first traversal should be used.

    Compares the hardware-utilisation estimate of part-kernel ordering
    against depth-kernel ordering for this IFM depth and kernel size.
    """
    # Shallow inputs always take the part-kernel path
    if ifm_shape.depth <= 8:
        return True

    # Rounding granularities depend on the IFM precision
    depth_round = 32 if ifm_bits == 8 else 16
    kernel_round = 4 if ifm_bits == 8 else 2

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, depth_round)
    part_utilisation = (ifm_shape.depth * kernel_elements) / (
        round_up(ifm_shape.depth, 8) * round_up(kernel_elements, kernel_round)
    )

    return part_utilisation > depth_utilisation
148
149
def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify how the op uses elementwise input buffering."""
    if npu_op_type != NpuBlockType.ElementWise:
        return ElementwiseUsage.No
    # Scalar second operand needs no IFM2 banks; otherwise a full feed
    return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
157
158
def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    # 16-bit scaled, non-pooling ops need the wider 40-bit accumulator
    needs_acc40 = scaled and (ifm_bits == 16) and (npu_op_type != NpuBlockType.Pooling)
    return SHRAMElements.Acc40 if needs_acc40 else SHRAMElements.Acc32
165
166
def is_nearest(ifm_resampling: resampling_mode) -> bool:
    """Return True when the IFM uses nearest-neighbour resampling."""
    return ifm_resampling == resampling_mode.NEAREST
169
170
def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the upscale factor implied by the IFM resampling mode."""
    # Any resampling other than NONE implies 2x upscaling
    if ifm_resampling == resampling_mode.NONE:
        return 1
    return 2
174
175
def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
    """Choose the IFM block depth for this IFM shape and precision."""
    # 16-bit inputs: cap at 16 and round to a granule of 4
    if ifm_bits == 16:
        return round_up(min(ifm_shape.depth, 16), 4)
    # 8-bit inputs: part-kernel caps at 16, depth-kernel at 32,
    # rounded to the IFM microblock depth
    depth_cap = 16 if is_partkernel else 32
    return round_up(min(ifm_shape.depth, depth_cap), arch.ifm_ublock.depth)
182
183
Fredrik Svedberg3ff7a4a2021-09-29 10:08:04 +0200184def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
185 return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))
Tim Halld8339a72021-05-27 18:49:40 +0100186
187
def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
    """Return (width, height) of the IFM area needed to produce ofm_shape."""
    scale = to_upscale(resampling_mode)
    rounding = is_nearest(resampling_mode)
    required_h = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), scale, rounding)
    required_w = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), scale, rounding)
    return (required_w, required_h)
194
195
def _get_ifm_blocksize(
    ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    """Compute the IFM block shape required to feed the given OFM block."""
    # Kernel extents are clamped to the maximum supported subkernel size
    kernel_h = min(kernel.area_height(), subkernel_limit.height)
    kernel_w = min(kernel.area_width(), subkernel_limit.width)

    # IFM block height, rounded up to the microblock granularity
    height = round_up(_required_size(ofm_block.height, kernel.stride.y, kernel_h, upscale, nearest), ublock.height)

    # IFM block width, rounded up to the microblock granularity
    width = round_up(_required_size(ofm_block.width, kernel.stride.x, kernel_w, upscale, nearest), ublock.width)

    # Depth is carried over from the OFM block (adjusted later for non-equal-depth ops)
    return Shape4D(1, height, width, ofm_block.depth)
214
215
def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
    """Clamp a block's height for the 1D-convolution special case.

    256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a
    specific interpretation of a more general constraint that can't be applied
    because the find_block_config function must return block configs that can
    be applied to any OFM shape.
    """
    is_conv1d_case = (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2)
    if not is_conv1d_case:
        return block
    return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
223
224
def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Shape4D,
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search OFM block configurations for the lowest estimated fetch cost.

    Sweeps width/height/depth over the OFM-microblock-aligned search space,
    keeping only candidates whose SHRAM layout fits (_try_block_config) and
    scoring each by estimated IFM + weight fetch traffic per OFM element.

    Returns the best ArchitectureBlockConfig, or None if nothing fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction: treat the bigger input as the IFM
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    # Search space: OFM shape clamped to the max block, rounded to microblocks
    ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        wont_fit = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    ofm_block,
                    ifm_block,
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                    ifm_depth_buf_scaling,
                    arch.ncores,
                )

                if layout:
                    # Block counts covering the whole OFM (rounded up and fractional)
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            # NOTE: stored as the pre-fit_block_for_ofm shape (non-1D-optimised)
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    # Record (width, height) so the transposed (height, width) probe is skipped
                    wont_fit[(width, height)] = True

        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    if best_cost != math.inf:
        return config

    return None
382
383
def try_block_config(
    block_config: Block,
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Block,
    ifm_shape: Block,
    ifm2_shape: Optional[Block],
    uses_scalar: bool,
    ifm_bits: int,
    is_partkernel: bool,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """
    Given a block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit or is invalid.
    """
    # Check block config validity: positive, within max block, microblock-aligned
    if not all(
        blk > 0 and blk <= blk_max and blk % ublk == 0
        for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
    ):
        return None
    # Elementwise larger-volume correction: treat the bigger input as the IFM
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_partkernel

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
    ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
    if not is_equal_depth_op:
        ifm_block = ifm_block.with_depth(ifm_blockdepth)

    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
    block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

    # Test whether this layout fits into SHRAM
    layout = _try_block_config(
        arch.shram,
        ew_usage,
        block_config_opt,
        ifm_block,
        ifm_bits,
        ifm_granule,
        acc_bits,
        acc_granule,
        lut_banks,
        ifm_depth_buf_scaling,
        arch.ncores,
    )
    if layout is None:
        return None
    config.layout = layout
    config.bank_size = arch.shram_bank_size
    config.ifm_block = ifm_block
    # NOTE: the original (non-1D-optimised) block is stored, not block_config_opt
    config.ofm_block = block_config
    return config
Tim Halld8339a72021-05-27 18:49:40 +0100472 return config