Blame - ethosu/vela/shared_buffer_allocation.py - ml/ethos-u/ethos-u-vela

blob: 1f027d60cdeb90328e6af036c5583e1bcb44192d [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	17	# Shared buffer allocation works out how to allocate the Ethos-U shared buffer for a given pass.
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	18	from typing import List
				19	from typing import Tuple
				20
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	21	import numpy as np
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	22
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	23	from .api import NpuActivationOp
				24	from .api import NpuBlockOperation
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	25	from .architecture_features import ArchitectureFeatures
				26	from .architecture_features import Block
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	27	from .architecture_features import SharedBufferArea
				28	from .architecture_features import SHRAMElements
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	29	from .errors import AllocationError
Dwight Lidman	7ad408b	2020-08-11 11:55:22 +0200	[diff] [blame]	30	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Tim Hall	4ed38bc	2020-10-20 18:54:20 +0100	[diff] [blame]	31	from .operation import Kernel
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	32	from .operation import NpuBlockType
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	33	from .range_set import MemoryRangeSet
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	34	from .register_command_stream_util import to_kernel
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	35	from .tensor import MemArea
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	36
				37
				38	class SharedBufferAllocation:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	39	def __init__(
				40	self,
				41	arch,
				42	kernel,
				43	uses_lut,
				44	npu_block_type,
				45	all_fms_have_quant,
				46	ifm_resampling_mode,
				47	ifm_bits,
				48	ifm_depth,
				49	ifm_count,
				50	ofm_shape,
				51	):
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	52	self.arch = arch
				53
				54	self.bank_locations = np.zeros(SharedBufferArea.Size)
				55	self.banks_required = np.zeros(SharedBufferArea.Size)
				56
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	57	self.kernel = Kernel(1, 1) if kernel is None else kernel
				58	self.is_elementwise = npu_block_type == NpuBlockType.ElementWise
				59	self.uses_lut = uses_lut
				60	self.ifm_count = ifm_count
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	61
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	62	self.is_equal_depth_op = self.is_elementwise or npu_block_type in (
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	63	NpuBlockType.ConvolutionDepthWise,
				64	NpuBlockType.Pooling,
				65	)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	66
				67	self.use_accumulator_element = SHRAMElements.Acc32
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	68	if self.is_elementwise:
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	69	self.use_ifm_element = SHRAMElements.IFM8_Elementwise
				70	else:
				71	self.use_ifm_element = SHRAMElements.IFM8
				72
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	73	self.ifm_resampling_mode = ifm_resampling_mode
				74	self.ifm_bits = ifm_bits
				75	self.ifm_depth = ifm_depth
				76	self.ifm_count = ifm_count
Andreas Nevalainen	6e82708	2020-10-14 13:55:43 +0200	[diff] [blame]	77
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	78	if self.ifm_bits == 16:
				79	if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant:
				80	self.use_accumulator_element = SHRAMElements.Acc40
				81	self.use_ifm_element = self.use_ifm_element + 1
				82	assert (self.use_ifm_element == SHRAMElements.IFM16) or (
				83	self.use_ifm_element == SHRAMElements.IFM16_Elementwise
				84	)
				85	elif self.ifm_bits == 32:
				86	assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
				87	self.use_ifm_element = SHRAMElements.IFM32
				88	else:
				89	assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	90
				91	self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	92	self.ofm_shape = ofm_shape
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	93
				94	self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
				95	self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks
				96
				97	def is_valid(self):
				98	# Assign zero-based bank starts (first element remains zero)
				99	self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]
				100
				101	# Accumulator area is measured from the end of the buffer
				102	self.bank_locations[SharedBufferArea.Accumulators] = (
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	103	self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	104	)
				105	ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
				106	return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
				107
				108	def try_block(self, ofm_block: Block):
				109	# Get IFM block configuration
				110	ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	111	ifm_block = self.arch.get_ifm_block_size(
				112	ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
				113	)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	114	ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
				115	if ifm_config is None:
				116	return None
				117
				118	# Get OFM block configuration
				119	ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
				120	if ofm_config is None:
				121	return None
				122
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	123	acc_banks = ofm_config.banks[self.use_accumulator_element]
				124
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	125	# Update bank counts for IFM and Accumulator
Andreas Nevalainen	6e82708	2020-10-14 13:55:43 +0200	[diff] [blame]	126	self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	127	self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	128
				129	# Validating calculates bank layout and returns validity
				130	if not self.is_valid():
				131	return None
				132
				133	return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
				134
				135	def generate_used_mask(self, active_set):
				136	res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
				137	for kind in active_set:
				138	start = int(self.bank_locations[kind])
				139	end = start + int(self.banks_required[kind])
				140	res[start:end] = 1
				141	return res
				142
				143	def is_compatible(first, second):
				144	"""See if the bank allocations of two convolutions are compatible,
				145	so that they can run back-to-back without a fence in between"""
				146
				147	first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
				148	second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))
				149
				150	first_mask = first.generate_used_mask(first_set)
				151	second_mask = second.generate_used_mask(second_set)
				152
				153	if np.sum(first_mask & second_mask):
				154	# overlap
				155	return False
				156
				157	return True
				158
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	159	def get_shram_memory_access_range(self):
				160	# Returns the SHRAM memory access range used by this shared buffer,
				161	# excluding access to LUT
				162	return MemoryRangeSet(
				163	MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
				164	)
				165
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	166
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	167	def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
Diqing Zhong	09387e2	2020-09-28 18:46:22 +0200	[diff] [blame]	168	tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
				169	scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	170	return len(tensors) == len(scales) and None not in scales
Diqing Zhong	09387e2	2020-09-28 18:46:22 +0200	[diff] [blame]	171
				172
def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Return True for a non-pooling block type whose feature maps all have
    quantization with a scale, False otherwise."""
    if npu_block_type == NpuBlockType.Pooling:
        return False
    return _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
				175
				176
				177	def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
				178	ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
				179	all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ifm2_tensor, ofm_tensor)
				180
				181	kernel = Kernel(1, 1)
				182	is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
				183	uses_lut = False
				184	ifm_count = 1
				185
				186	if ps.primary_op:
				187	kernel = ps.primary_op.kernel
				188	uses_lut = ps.primary_op.activation_lut is not None
				189
				190	ifm_resampling_mode = resampling_mode.NONE
				191	ifm_bits = 0
				192	ifm_depth = 0
				193	if ifm_tensor:
				194	ifm_resampling_mode = ifm_tensor.resampling_mode
				195	ifm_bits = ifm_tensor.dtype.size_in_bits()
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame^]	196	ifm_shape = ps.primary_op.ifm_shapes[0]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	197
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame^]	198	if ifm_shape != []:
				199	ifm_depth = ifm_shape[-1]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	200
				201	if is_elementwise:
				202	ifm_count = 2
				203	if ifm_tensor.shape == []: # Scalar in ifm1
				204	assert ifm2_tensor
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame^]	205	ifm_depth = ps.primary_op.ifm_shapes[1][-1]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	206	ifm_count = 1
				207	elif not ifm2_tensor or ifm2_tensor.shape == []: # Scalar in ifm2
				208	ifm_count = 1
				209	return SharedBufferAllocation(
				210	arch,
				211	kernel,
				212	uses_lut,
				213	npu_block_type=ps.npu_block_type,
				214	all_fms_have_quant=all_fms_have_quant,
				215	ifm_resampling_mode=ifm_resampling_mode,
				216	ifm_bits=ifm_bits,
				217	ifm_depth=ifm_depth,
				218	ifm_count=ifm_count,
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame^]	219	ofm_shape=ps.primary_op.ofm_shapes[0],
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	220	)
				221
				222
				223	def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
				224	alloc = shared_buffer_allocation_for_pass(arch, ps)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	225	assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
				226	if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
				227	return alloc
				228
				229	return None
				230
				231
def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
    """Create the SharedBufferAllocation for an operation given as an
    NpuBlockOperation."""
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    # All feature maps in use: IFM, OFM and, when present, IFM2
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms
    )

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_depth = npu_op.ifm.shape.depth

    # A second IFM only counts when it is not given as a scalar
    if npu_op.ifm2 is not None and npu_op.ifm2_scalar is None:
        ifm_count = 2
    else:
        ifm_count = 1

    ofm = npu_op.ofm.shape
    ofm_shape = [1, ofm.height, ofm.width, ofm.depth]

    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=ofm_shape,
    )
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	256
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	257
				258	def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]:
				259	"""Returns list of block configs that would fit with the given shared buffer allocation"""
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	260	if arch.override_block_config:
				261	config = alloc.try_block(arch.override_block_config)
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame]	262	if config is None:
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	263	raise AllocationError(f"Block config override '{arch.override_block_config}' cannot be allocated")
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame]	264	return [config]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	265
				266	# Constrain the search space if the OFM is smaller than the max block size
				267	# - Add other block search constraints here if required
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	268	if len(alloc.ofm_shape) <= 2:
				269	max_block_height = max_block_width = alloc.ofm_shape[0]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	270	else:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	271	max_block_width = alloc.ofm_shape[-2]
				272	max_block_height = alloc.ofm_shape[-3]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	273
				274	# Common block depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	275	max_block_depth = alloc.ofm_shape[-1]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	276
				277	# Constrain to valid ranges before search
				278	max_block_width = min(arch.ofm_block_max.width, max_block_width)
				279	max_block_height = min(arch.ofm_block_max.height, max_block_height)
				280	max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
				281
				282	valid_block_configs = []
				283	# Try a range of block shapes against this pass
				284	for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
				285	for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
				286	# Try valid OFM block depths
				287	for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
				288	# OFM block depth has the constraint that if it causes the OFM to be
				289	# split, it must be a multiple of the OFM split size
				290	if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
				291	config = alloc.try_block(Block(w, h, c))
				292	if config:
				293	valid_block_configs.append(config)
				294
				295	assert len(valid_block_configs) > 0
				296	return valid_block_configs
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	297
				298
				299	def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
				300	alloc = shared_buffer_allocation_for_pass(arch, ps)
				301	return find_suitable_block_configs(arch, alloc)