ethosu/vela/architecture_allocator.py - ml/ethos-u/ethos-u-vela - Gitiles

 # Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
 #
 # SPDX-License-Identifier: Apache-2.0
 #
 # Licensed under the Apache License, Version 2.0 (the License); you may
 # not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 #
 # www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an AS IS BASIS, WITHOUT
 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
 # Description: Architecture SHRAM allocator
 import enum
 import math
 from typing import Optional
 from typing import Tuple

 from .architecture_features import ArchitectureFeatures
 from .architecture_features import Block
 from .architecture_features import SHRAMConfig
 from .architecture_features import SHRAMElements
 from .ethos_u55_regs.ethos_u55_regs import resampling_mode
 from .numeric_util import round_up
 from .numeric_util import round_up_divide
 from .operation import Kernel
 from .operation import NpuBlockType
 from .range_set import MemoryRangeSet
 from .shape4d import Shape4D
 from .tensor import MemArea


 class SHRAMLayout:
     def __init__(self):
         self.ib_start = 0
         self.ib_end = 0
         self.ib_start2 = 0
         self.ab_start = 0
         self.lut_start = 0


 class ArchitectureBlockConfig:
     def __init__(self):
         self.layout = SHRAMLayout()
         self.ifm_block = Shape4D()
         self.ofm_block = Shape4D()  # non-1D-optimised block
         self.acc_type = SHRAMElements.Acc32
         self.is_partkernel = False
         self.bank_size = 0

     def get_shram_memory_access_range(self):
         # Returns the SHRAM memory access range used by this shared buffer,
         # excluding access to LUT
         return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

     def old_style_representation(self):
         return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

     def __str__(self):
         return str(self.old_style_representation())


 _AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}


 class ElementwiseUsage(enum.IntEnum):
     No = 0
     Full = 1
     Scalar = 2


 def _try_block_config(
     shram: SHRAMConfig,
     ew_usage: ElementwiseUsage,
     ofm_block: Block,
     ifm_block: Block,
     ifm_bits: int,
     ifm_granule: int,
     acc_bits: int,
     acc_granule: int,
     lut_banks: int,
 ) -> SHRAMLayout:
     assert (acc_bits > 0) and (acc_granule > 0)
     assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

     # Aways need IFM space
     ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
     ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
     ifm_banks = round_up(ifm_banks, ifm_granule)

     # Calculate SHRAM boundaries of the IFM and Accumulators
     lut_start = shram.total_banks - lut_banks
     ifm_end = shram.reserved_output_banks + ifm_banks
     ifm2_start = ifm_end
     acc_start = lut_start

     # If not elementwise then we need accumulator space
     if ew_usage == ElementwiseUsage.No:
         acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
         acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
         acc_banks = round_up(acc_banks, acc_granule)
         acc_start = acc_start - acc_banks
     else:
         ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
         if ifm2_start + ifm2_banks > acc_start:
             return None
         ifm_end = acc_start

     # IFM must still fit before accumulators
     if ifm_end > acc_start:
         return None

     # Should all fit, so return this layout
     layout = SHRAMLayout()
     layout.ib_start = shram.reserved_output_banks
     layout.ib_start2 = ifm2_start
     layout.ib_end = ifm_end
     layout.ab_start = acc_start
     layout.lut_start = lut_start
     return layout


 def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
     if ifm_shape.depth <= 8:
         return True

     # Compare part-kernel to depth-kernel and choose the one with best utilisation
     kernel_elements = kernel.elements_wh()
     depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
     part_utilisation = (
         ifm_shape.depth
         * kernel_elements
         / (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
     )

     return part_utilisation > depth_utilisation


 def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
     ew_usage = ElementwiseUsage.No
     if npu_op_type == NpuBlockType.ElementWise:
         ew_usage = ElementwiseUsage.Full
         if uses_scalar:
             ew_usage = ElementwiseUsage.Scalar
     return ew_usage


 def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
     """Returns accumulator type"""
     acc_type = SHRAMElements.Acc32
     if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
         acc_type = SHRAMElements.Acc40
     return acc_type


 def is_nearest(ifm_resampling: resampling_mode) -> bool:
     return ifm_resampling == resampling_mode.NEAREST


 def to_upscale(ifm_resampling: resampling_mode) -> int:
     # Upscaling depending on resampling mode
     return 1 if ifm_resampling == resampling_mode.NONE else 2


 def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
     if ifm_bits == 16:
         ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
     else:
         ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
     return ifm_blockdepth


 def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
     return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))


 def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
     upscale = to_upscale(resampling_mode)
     nearest = is_nearest(resampling_mode)
     h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
     w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
     return (w1, h1)


 def _get_ifm_blocksize(
     ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
 ) -> Shape4D:
     # IFM block height
     h1 = _required_size(
         ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
     )
     h2 = h1
     height = round_up(min(h1, h2), ublock.height)

     # IFM block width
     w1 = _required_size(
         ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
     )
     w2 = w1
     width = round_up(min(w1, w2), ublock.width)

     return Shape4D(1, height, width, ofm_block.depth)


 def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
     # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) This is a specific
     # interpretation of a more general constraint that can't be applied because the
     # find_block_config function must return block configs that can be applied to any OFM shape.
     if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
         return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
     return block


 def find_block_config(
     arch: ArchitectureFeatures,
     npu_op_type: NpuBlockType,
     ofm_shape: Shape4D,
     ifm_shape: Shape4D,
     ifm2_shape: Shape4D,
     uses_scalar: bool,
     ifm_bits: int,
     kernel: Kernel,
     lut_banks: int,
     scaled: bool,
     ifm_resampling: resampling_mode,
 ) -> ArchitectureBlockConfig:
     SplitDepth = ArchitectureFeatures.OFMSplitDepth
     # Elementwise larger-volume correction
     if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
         ifm_shape = ifm2_shape

     # Figure out if SHRAM should be portioned for elementwise
     ew_usage = _ew_usage(npu_op_type, uses_scalar)

     # Operator typing help
     is_pooling = npu_op_type == NpuBlockType.Pooling
     is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
     is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
     is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

     # Block config to be returned
     config = ArchitectureBlockConfig()
     config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

     # Accumulator & granule settings
     config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

     # Memory rounding granules
     acc_granule = arch.accumulator_granules[config.acc_type]
     acc_bits = _AccumulatorBits[config.acc_type]
     if ew_usage != ElementwiseUsage.No:
         ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
     else:
         ifm_granule = arch.ifm_bank_granules[ifm_bits]
     lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
     upscale = to_upscale(ifm_resampling)
     nearest = is_nearest(ifm_resampling)

     # Subkernel repeats of the IFM
     ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
         kernel.area_height(), arch.SubKernelMax.height
     )
     ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

     # Weights fetch (for operators that have them)
     weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

     search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
     search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

     # Block WHC search, loops across the search space looking for best efficiency
     best_cost = math.inf
     best_coverage = math.inf
     depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
     if depth < ofm_shape.depth:
         depth = round_up(depth, SplitDepth)

     while depth <= search_space.depth:
         wont_fit = {}
         for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
             for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                 # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                 # fit, then 4x8x16 won't either.
                 if wont_fit.get((height, width), False):
                     continue

                 # Calculate the IFM block dimensions required to feed this OFM block
                 ofm_block = Shape4D(1, height, width, depth)
                 ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                 if not is_equal_depth_op:
                     ifm_block = ifm_block.with_depth(ifm_blockdepth)

                 # Test if the IFM/OFM blocks fit into SHRAM
                 ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                 layout = _try_block_config(
                     arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
                 )

                 if layout:
                     full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                     blocks = ofm_shape / ofm_block

                     # Weights fetching
                     weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                     if not is_depthwise:
                         weight_fetch *= ofm_block.depth * blocks.depth

                     # IFM fetching
                     ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                     if not is_equal_depth_op:
                         ifm_fetch *= full_blocks.depth

                     # Scale relative to every output OFM element
                     relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                     # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                     if ifm_shape.elements() < ifm_block.elements() * 2:
                         relative_cost = relative_cost / 2

                     # Choose based on relative minimum cost or larger IFM area (if equal cost)
                     if relative_cost <= best_cost:
                         choose_this = False
                         # Check IFM coverage only when it's equal best_cost and small OFM
                         if relative_cost == best_cost:
                             coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                             coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                             # Small 4x4 IFM constraint found through analysis of networks
                             if coverage <= best_coverage and (height <= 4 and width <= 4):
                                 best_coverage = coverage
                                 choose_this = True
                         else:
                             best_coverage = math.inf
                             choose_this = True

                         if choose_this:
                             best_cost = relative_cost
                             config.layout = layout
                             config.bank_size = arch.shram_bank_size
                             config.ifm_block = ifm_block
                             config.ofm_block = Shape4D(1, height, width, depth)
                 else:
                     wont_fit[(width, height)] = True

         depth = depth + arch.ofm_ublock.depth
         if depth < ofm_shape.depth:
             depth = round_up(depth, SplitDepth)

     if best_cost != math.inf:
         return config

     return None


 def try_block_config(
     block_config: Block,
     arch: ArchitectureFeatures,
     npu_op_type: NpuBlockType,
     ofm_shape: Block,
     ifm_shape: Block,
     ifm2_shape: Optional[Block],
     uses_scalar: bool,
     ifm_bits: int,
     is_partkernel: bool,
     kernel: Kernel,
     lut_banks: int,
     scaled: bool,
     ifm_resampling: resampling_mode,
 ) -> Optional[ArchitectureBlockConfig]:
     """
     Given a block_config, returns a corresponding ArchitectureBlockConfig.
     Returns None if the block_config does not fit or is invalid.
     """
     # Check block config validity
     if not all(
         blk > 0 and blk <= blk_max and blk % ublk == 0
         for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
     ):
         return None
     # Elementwise larger-volume correction
     if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
         ifm_shape = ifm2_shape

     ew_usage = _ew_usage(npu_op_type, uses_scalar)

     # Operator typing help
     is_pooling = npu_op_type == NpuBlockType.Pooling
     is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
     is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

     # Block config to be returned
     config = ArchitectureBlockConfig()
     config.is_partkernel = is_partkernel

     # Accumulator & granule settings
     config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

     # Memory rounding granules
     acc_granule = arch.accumulator_granules[config.acc_type]
     acc_bits = _AccumulatorBits[config.acc_type]
     if ew_usage != ElementwiseUsage.No:
         ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
     else:
         ifm_granule = arch.ifm_bank_granules[ifm_bits]
     lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
     upscale = to_upscale(ifm_resampling)
     nearest = is_nearest(ifm_resampling)
     ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
     ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
     if not is_equal_depth_op:
         ifm_block = ifm_block.with_depth(ifm_blockdepth)

     # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
     block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

     layout = _try_block_config(
         arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
     )
     if layout is None:
         return None
     config.layout = layout
     config.bank_size = arch.shram_bank_size
     config.ifm_block = ifm_block
     config.ofm_block = block_config
     return config
	# Copyright (C) 2021 Arm Limited or its affiliates. All rights reserved.
	#
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the License); you may
	# not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#
	# Description: Architecture SHRAM allocator
	import enum
	import math
	from typing import Optional
	from typing import Tuple

	from .architecture_features import ArchitectureFeatures
	from .architecture_features import Block
	from .architecture_features import SHRAMConfig
	from .architecture_features import SHRAMElements
	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
	from .numeric_util import round_up
	from .numeric_util import round_up_divide
	from .operation import Kernel
	from .operation import NpuBlockType
	from .range_set import MemoryRangeSet
	from .shape4d import Shape4D
	from .tensor import MemArea


	class SHRAMLayout:
	def __init__(self):
	self.ib_start = 0
	self.ib_end = 0
	self.ib_start2 = 0
	self.ab_start = 0
	self.lut_start = 0


	class ArchitectureBlockConfig:
	def __init__(self):
	self.layout = SHRAMLayout()
	self.ifm_block = Shape4D()
	self.ofm_block = Shape4D() # non-1D-optimised block
	self.acc_type = SHRAMElements.Acc32
	self.is_partkernel = False
	self.bank_size = 0

	def get_shram_memory_access_range(self):
	# Returns the SHRAM memory access range used by this shared buffer,
	# excluding access to LUT
	return MemoryRangeSet(MemArea.Shram, 0, self.layout.lut_start * self.bank_size)

	def old_style_representation(self):
	return [self.ofm_block.height, self.ofm_block.width, self.ifm_block.depth, self.ofm_block.depth]

	def __str__(self):
	return str(self.old_style_representation())


	_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}


	class ElementwiseUsage(enum.IntEnum):
	No = 0
	Full = 1
	Scalar = 2


	def _try_block_config(
	shram: SHRAMConfig,
	ew_usage: ElementwiseUsage,
	ofm_block: Block,
	ifm_block: Block,
	ifm_bits: int,
	ifm_granule: int,
	acc_bits: int,
	acc_granule: int,
	lut_banks: int,
	) -> SHRAMLayout:
	assert (acc_bits > 0) and (acc_granule > 0)
	assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

	# Aways need IFM space
	ifm_bytes = ifm_block.elements_wh() * round_up((ifm_block.depth * ifm_bits) / 8, 8)
	ifm_banks = round_up_divide(ifm_bytes, shram.bank_size_bytes) * 2
	ifm_banks = round_up(ifm_banks, ifm_granule)

	# Calculate SHRAM boundaries of the IFM and Accumulators
	lut_start = shram.total_banks - lut_banks
	ifm_end = shram.reserved_output_banks + ifm_banks
	ifm2_start = ifm_end
	acc_start = lut_start

	# If not elementwise then we need accumulator space
	if ew_usage == ElementwiseUsage.No:
	acc_bytes = (ofm_block.elements_wh() * round_up(ofm_block.depth, 8) * acc_bits) // 8
	acc_banks = round_up_divide(acc_bytes, shram.bank_size_bytes) * 2
	acc_banks = round_up(acc_banks, acc_granule)
	acc_start = acc_start - acc_banks
	else:
	ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
	if ifm2_start + ifm2_banks > acc_start:
	return None
	ifm_end = acc_start

	# IFM must still fit before accumulators
	if ifm_end > acc_start:
	return None

	# Should all fit, so return this layout
	layout = SHRAMLayout()
	layout.ib_start = shram.reserved_output_banks
	layout.ib_start2 = ifm2_start
	layout.ib_end = ifm_end
	layout.ab_start = acc_start
	layout.lut_start = lut_start
	return layout


	def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
	if ifm_shape.depth <= 8:
	return True

	# Compare part-kernel to depth-kernel and choose the one with best utilisation
	kernel_elements = kernel.elements_wh()
	depth_utilisation = ifm_shape.depth / round_up(ifm_shape.depth, 32 if ifm_bits == 8 else 16)
	part_utilisation = (
	ifm_shape.depth
	* kernel_elements
	/ (round_up(ifm_shape.depth, 8) * round_up(kernel_elements, 4 if ifm_bits == 8 else 2))
	)

	return part_utilisation > depth_utilisation


	def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
	ew_usage = ElementwiseUsage.No
	if npu_op_type == NpuBlockType.ElementWise:
	ew_usage = ElementwiseUsage.Full
	if uses_scalar:
	ew_usage = ElementwiseUsage.Scalar
	return ew_usage


	def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
	"""Returns accumulator type"""
	acc_type = SHRAMElements.Acc32
	if (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled:
	acc_type = SHRAMElements.Acc40
	return acc_type


	def is_nearest(ifm_resampling: resampling_mode) -> bool:
	return ifm_resampling == resampling_mode.NEAREST


	def to_upscale(ifm_resampling: resampling_mode) -> int:
	# Upscaling depending on resampling mode
	return 1 if ifm_resampling == resampling_mode.NONE else 2


	def _ifm_blockdepth(arch, ifm_shape: Shape4D, ifm_bits: int, is_partkernel: bool):
	if ifm_bits == 16:
	ifm_blockdepth = round_up(min(ifm_shape.depth, 16), 4)
	else:
	ifm_blockdepth = round_up(min(ifm_shape.depth, 16 if is_partkernel else 32), arch.ifm_ublock.depth)
	return ifm_blockdepth


	def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:
	return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))


	def get_ifm_area_required(ofm_shape: Shape4D, kernel: Kernel, resampling_mode: resampling_mode) -> Tuple[int, int]:
	upscale = to_upscale(resampling_mode)
	nearest = is_nearest(resampling_mode)
	h1 = _required_size(ofm_shape.height, kernel.stride.y, kernel.area_height(), upscale, nearest)
	w1 = _required_size(ofm_shape.width, kernel.stride.x, kernel.area_width(), upscale, nearest)
	return (w1, h1)


	def _get_ifm_blocksize(
	ofm_block: Shape4D, kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
	) -> Shape4D:
	# IFM block height
	h1 = _required_size(
	ofm_block.height, kernel.stride.y, min(kernel.area_height(), subkernel_limit.height), upscale, nearest
	)
	h2 = h1
	height = round_up(min(h1, h2), ublock.height)

	# IFM block width
	w1 = _required_size(
	ofm_block.width, kernel.stride.x, min(kernel.area_width(), subkernel_limit.width), upscale, nearest
	)
	w2 = w1
	width = round_up(min(w1, w2), ublock.width)

	return Shape4D(1, height, width, ofm_block.depth)


	def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
	# 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes) This is a specific
	# interpretation of a more general constraint that can't be applied because the
	# find_block_config function must return block configs that can be applied to any OFM shape.
	if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
	return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)
	return block


	def find_block_config(
	arch: ArchitectureFeatures,
	npu_op_type: NpuBlockType,
	ofm_shape: Shape4D,
	ifm_shape: Shape4D,
	ifm2_shape: Shape4D,
	uses_scalar: bool,
	ifm_bits: int,
	kernel: Kernel,
	lut_banks: int,
	scaled: bool,
	ifm_resampling: resampling_mode,
	) -> ArchitectureBlockConfig:
	SplitDepth = ArchitectureFeatures.OFMSplitDepth
	# Elementwise larger-volume correction
	if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
	ifm_shape = ifm2_shape

	# Figure out if SHRAM should be portioned for elementwise
	ew_usage = _ew_usage(npu_op_type, uses_scalar)

	# Operator typing help
	is_pooling = npu_op_type == NpuBlockType.Pooling
	is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
	is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
	is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

	# Block config to be returned
	config = ArchitectureBlockConfig()
	config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

	# Accumulator & granule settings
	config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

	# Memory rounding granules
	acc_granule = arch.accumulator_granules[config.acc_type]
	acc_bits = _AccumulatorBits[config.acc_type]
	if ew_usage != ElementwiseUsage.No:
	ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
	else:
	ifm_granule = arch.ifm_bank_granules[ifm_bits]
	lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
	upscale = to_upscale(ifm_resampling)
	nearest = is_nearest(ifm_resampling)

	# Subkernel repeats of the IFM
	ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
	kernel.area_height(), arch.SubKernelMax.height
	)
	ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

	# Weights fetch (for operators that have them)
	weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

	search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
	search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()))

	# Block WHC search, loops across the search space looking for best efficiency
	best_cost = math.inf
	best_coverage = math.inf
	depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
	if depth < ofm_shape.depth:
	depth = round_up(depth, SplitDepth)

	while depth <= search_space.depth:
	wont_fit = {}
	for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
	for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
	# Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
	# fit, then 4x8x16 won't either.
	if wont_fit.get((height, width), False):
	continue

	# Calculate the IFM block dimensions required to feed this OFM block
	ofm_block = Shape4D(1, height, width, depth)
	ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
	if not is_equal_depth_op:
	ifm_block = ifm_block.with_depth(ifm_blockdepth)

	# Test if the IFM/OFM blocks fit into SHRAM
	ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
	layout = _try_block_config(
	arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
	)

	if layout:
	full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
	blocks = ofm_shape / ofm_block

	# Weights fetching
	weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
	if not is_depthwise:
	weight_fetch = ofm_block.depth blocks.depth

	# IFM fetching
	ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
	if not is_equal_depth_op:
	ifm_fetch *= full_blocks.depth

	# Scale relative to every output OFM element
	relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

	# If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
	if ifm_shape.elements() < ifm_block.elements() * 2:
	relative_cost = relative_cost / 2

	# Choose based on relative minimum cost or larger IFM area (if equal cost)
	if relative_cost <= best_cost:
	choose_this = False
	# Check IFM coverage only when it's equal best_cost and small OFM
	if relative_cost == best_cost:
	coverage_shape = Shape4D.min(ifm_shape, ifm_block)
	coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
	# Small 4x4 IFM constraint found through analysis of networks
	if coverage <= best_coverage and (height <= 4 and width <= 4):
	best_coverage = coverage
	choose_this = True
	else:
	best_coverage = math.inf
	choose_this = True

	if choose_this:
	best_cost = relative_cost
	config.layout = layout
	config.bank_size = arch.shram_bank_size
	config.ifm_block = ifm_block
	config.ofm_block = Shape4D(1, height, width, depth)
	else:
	wont_fit[(width, height)] = True

	depth = depth + arch.ofm_ublock.depth
	if depth < ofm_shape.depth:
	depth = round_up(depth, SplitDepth)

	if best_cost != math.inf:
	return config

	return None


	def try_block_config(
	block_config: Block,
	arch: ArchitectureFeatures,
	npu_op_type: NpuBlockType,
	ofm_shape: Block,
	ifm_shape: Block,
	ifm2_shape: Optional[Block],
	uses_scalar: bool,
	ifm_bits: int,
	is_partkernel: bool,
	kernel: Kernel,
	lut_banks: int,
	scaled: bool,
	ifm_resampling: resampling_mode,
	) -> Optional[ArchitectureBlockConfig]:
	"""
	Given a block_config, returns a corresponding ArchitectureBlockConfig.
	Returns None if the block_config does not fit or is invalid.
	"""
	# Check block config validity
	if not all(
	blk > 0 and blk <= blk_max and blk % ublk == 0
	for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())
	):
	return None
	# Elementwise larger-volume correction
	if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
	ifm_shape = ifm2_shape

	ew_usage = _ew_usage(npu_op_type, uses_scalar)

	# Operator typing help
	is_pooling = npu_op_type == NpuBlockType.Pooling
	is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
	is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

	# Block config to be returned
	config = ArchitectureBlockConfig()
	config.is_partkernel = is_partkernel

	# Accumulator & granule settings
	config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

	# Memory rounding granules
	acc_granule = arch.accumulator_granules[config.acc_type]
	acc_bits = _AccumulatorBits[config.acc_type]
	if ew_usage != ElementwiseUsage.No:
	ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
	else:
	ifm_granule = arch.ifm_bank_granules[ifm_bits]
	lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
	upscale = to_upscale(ifm_resampling)
	nearest = is_nearest(ifm_resampling)
	ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)
	ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
	if not is_equal_depth_op:
	ifm_block = ifm_block.with_depth(ifm_blockdepth)

	# 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
	block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

	layout = _try_block_config(
	arch.shram, ew_usage, block_config_opt, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
	)
	if layout is None:
	return None
	config.layout = layout
	config.bank_size = arch.shram_bank_size
	config.ifm_block = ifm_block
	config.ofm_block = block_config
	return config