# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Shared buffer allocation works out how to allocate the Ethos-U shared buffer for a given pass.
from typing import List
from typing import Tuple

import numpy as np

from .api import NpuActivationOp
from .api import NpuBlockOperation
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .operation import Kernel
from .operation import NpuBlockType
from .range_set import MemoryRangeSet
from .register_command_stream_util import to_kernel
from .shape4d import Shape4D
from .tensor import MemArea


class SharedBufferAllocation:
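    """Works out how the Ethos-U SHRAM shared buffer is divided between the
    Weights, IFM, Accumulator and OFM areas for a single pass"""
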
    def __init__(
        self,
        arch,
        kernel,
        uses_lut,
        npu_block_type,
        all_fms_have_quant,
        ifm_resampling_mode,
        ifm_bits,
        ifm_depth,
        ifm_count,
        ofm_shape,
    ):
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        self.kernel = Kernel(1, 1) if kernel is None else kernel
        self.is_elementwise = npu_block_type == NpuBlockType.ElementWise
        self.uses_lut = uses_lut
        self.ifm_count = ifm_count

        self.is_equal_depth_op = self.is_elementwise or npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )

        self.use_accumulator_element = SHRAMElements.Acc32
        if self.is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_resampling_mode = ifm_resampling_mode
        self.ifm_bits = ifm_bits
        self.ifm_depth = ifm_depth
        self.ifm_count = ifm_count

        if self.ifm_bits == 16:
            if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant:
                self.use_accumulator_element = SHRAMElements.Acc40
            self.use_ifm_element = self.use_ifm_element + 1
            assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                self.use_ifm_element == SHRAMElements.IFM16_Elementwise
            )
        elif self.ifm_bits == 32:
            assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
            self.use_ifm_element = SHRAMElements.IFM32
        else:
            assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_shape = ofm_shape

        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
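        """Computes the bank start locations from the current bank requirements and
        checks that the IFM area does not overlap the accumulator area, which is
        placed at the end of the available SHRAM banks"""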
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
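        """Tries to fit the given OFM block (and the IFM block it implies) into SHRAM;
        returns the block config tuple (ofm height, ofm width, ifm depth, ofm depth)
        on success, or None if the block does not fit"""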
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        acc_banks = ofm_config.banks[self.use_accumulator_element]

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
        self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
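        """Returns a per-bank mask with ones marking the banks used by the areas in active_set"""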
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""

        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        if np.sum(first_mask & second_mask):
            # overlap
            return False

        return True

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(
            MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
        )


def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
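    """Returns True if all the given feature map tensors have a quantization with a valid scale"""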
    tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
    scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
    return len(tensors) == len(scales) and None not in scales


def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
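    """Returns True if the 40-bit accumulator is used: 16-bit IFM, not a pooling operation
    and all feature maps have valid quantization scales"""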
    return (
        ifm_tensor.dtype.size_in_bits() == 16
        and npu_block_type != NpuBlockType.Pooling
        and _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
    )


def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
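    """Creates a SharedBufferAllocation for the given pass, deriving the allocation
    parameters from the pass' primary op and its feature map tensors"""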
    ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
    all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ifm2_tensor, ofm_tensor)

    kernel = Kernel(1, 1)
    is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
    uses_lut = False
    ifm_count = 1

    if ps.primary_op:
        kernel = ps.primary_op.kernel
        uses_lut = ps.primary_op.activation_lut is not None

    ifm_resampling_mode = resampling_mode.NONE
    ifm_bits = 0
    ifm_depth = 0
    if ifm_tensor:
        ifm_resampling_mode = ifm_tensor.resampling_mode
        ifm_bits = ifm_tensor.dtype.size_in_bits()
        ifm_shape = ps.primary_op.ifm_shapes[0]

        if ifm_tensor.shape != []:
            ifm_depth = ifm_shape.depth

        if is_elementwise:
            ifm_count = 2
            if ifm_tensor.shape == []:  # Scalar in ifm1
                assert ifm2_tensor
                ifm_depth = ps.primary_op.ifm_shapes[1].depth
                ifm_count = 1
            elif not ifm2_tensor or ifm2_tensor.shape == []:  # Scalar in ifm2
                ifm_count = 1
    return SharedBufferAllocation(
        arch,
        kernel,
        uses_lut,
        npu_block_type=ps.npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=ps.primary_op.ofm_shapes[0],
    )


def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
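    """Returns the shared buffer allocation for the pass if the given block config
    (ofm height, ofm width, ifm depth, ofm depth) fits in SHRAM, otherwise None"""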
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
    if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
        return alloc

    return None


def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
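    """Creates a SharedBufferAllocation directly from the given NpuBlockOperation"""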
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_depth = npu_op.ifm.shape.depth
    ifm_count = 2 if npu_op.ifm2 is not None and npu_op.ifm2_scalar is None else 1
    ofm_shape = [1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth]
    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=Shape4D(ofm_shape),
    )


def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]:
    """Returns list of block configs that would fit with the given shared buffer allocation"""

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    max_block_width = alloc.ofm_shape.width
    max_block_height = alloc.ofm_shape.height
    max_block_depth = alloc.ofm_shape.depth

    # Constrain to valid ranges before search
    max_block_width = min(arch.ofm_block_max.width, max_block_width)
    max_block_height = min(arch.ofm_block_max.height, max_block_height)
    max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)

    min_block_height = max(arch.ofm_ublock.height, 2 if alloc.ifm_resampling_mode != resampling_mode.NONE else 1)
    min_block_width = max(arch.ofm_ublock.width, 2 if alloc.ifm_resampling_mode != resampling_mode.NONE else 1)

    valid_block_configs = []
    # Try a range of block shapes against this pass
    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
                    config = alloc.try_block(Block(w, h, c))
                    if config:
                        valid_block_configs.append(config)

    assert len(valid_block_configs) > 0
    return valid_block_configs


def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
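    """Returns list of block configs that fit the shared buffer allocation created for the given pass"""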
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    return find_suitable_block_configs(arch, alloc)