Blame - ethosu/vela/shared_buffer_allocation.py - ml/ethos-u/ethos-u-vela

blob: d8faf369d63c120971a243fbdc89fb8428071f97 [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
Tim Hall	c8a7386	2020-10-27 12:43:14 +0000	[diff] [blame]	17	# Shared buffer allocation works out how to allocate the Ethos-U shared buffer for a given pass.
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	18	from typing import List
				19	from typing import Tuple
				20
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	21	import numpy as np
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	22
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	23	from .api import NpuActivationOp
				24	from .api import NpuBlockOperation
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	25	from .architecture_features import ArchitectureFeatures
				26	from .architecture_features import Block
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	27	from .architecture_features import SharedBufferArea
				28	from .architecture_features import SHRAMElements
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	29	from .errors import AllocationError
Dwight Lidman	7ad408b	2020-08-11 11:55:22 +0200	[diff] [blame]	30	from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Tim Hall	4ed38bc	2020-10-20 18:54:20 +0100	[diff] [blame]	31	from .operation import Kernel
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	32	from .operation import NpuBlockType
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	33	from .range_set import MemoryRangeSet
Louis Verhaard	1e17018	2020-11-26 11:42:04 +0100	[diff] [blame]	34	from .register_command_stream_util import to_kernel
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame^]	35	from .shape4d import Shape4D
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	36	from .tensor import MemArea
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	37
				38
				39	class SharedBufferAllocation:
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	40	def __init__(
				41	self,
				42	arch,
				43	kernel,
				44	uses_lut,
				45	npu_block_type,
				46	all_fms_have_quant,
				47	ifm_resampling_mode,
				48	ifm_bits,
				49	ifm_depth,
				50	ifm_count,
				51	ofm_shape,
				52	):
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	53	self.arch = arch
				54
				55	self.bank_locations = np.zeros(SharedBufferArea.Size)
				56	self.banks_required = np.zeros(SharedBufferArea.Size)
				57
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	58	self.kernel = Kernel(1, 1) if kernel is None else kernel
				59	self.is_elementwise = npu_block_type == NpuBlockType.ElementWise
				60	self.uses_lut = uses_lut
				61	self.ifm_count = ifm_count
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	62
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	63	self.is_equal_depth_op = self.is_elementwise or npu_block_type in (
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	64	NpuBlockType.ConvolutionDepthWise,
				65	NpuBlockType.Pooling,
				66	)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	67
				68	self.use_accumulator_element = SHRAMElements.Acc32
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	69	if self.is_elementwise:
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	70	self.use_ifm_element = SHRAMElements.IFM8_Elementwise
				71	else:
				72	self.use_ifm_element = SHRAMElements.IFM8
				73
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	74	self.ifm_resampling_mode = ifm_resampling_mode
				75	self.ifm_bits = ifm_bits
				76	self.ifm_depth = ifm_depth
				77	self.ifm_count = ifm_count
Andreas Nevalainen	6e82708	2020-10-14 13:55:43 +0200	[diff] [blame]	78
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	79	if self.ifm_bits == 16:
				80	if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant:
				81	self.use_accumulator_element = SHRAMElements.Acc40
				82	self.use_ifm_element = self.use_ifm_element + 1
				83	assert (self.use_ifm_element == SHRAMElements.IFM16) or (
				84	self.use_ifm_element == SHRAMElements.IFM16_Elementwise
				85	)
				86	elif self.ifm_bits == 32:
				87	assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
				88	self.use_ifm_element = SHRAMElements.IFM32
				89	else:
				90	assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	91
				92	self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	93	self.ofm_shape = ofm_shape
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	94
				95	self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
				96	self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks
				97
				98	def is_valid(self):
				99	# Assign zero-based bank starts (first element remains zero)
				100	self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]
				101
				102	# Accumulator area is measured from the end of the buffer
				103	self.bank_locations[SharedBufferArea.Accumulators] = (
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	104	self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	105	)
				106	ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
				107	return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
				108
				109	def try_block(self, ofm_block: Block):
				110	# Get IFM block configuration
				111	ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	112	ifm_block = self.arch.get_ifm_block_size(
				113	ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
				114	)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	115	ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
				116	if ifm_config is None:
				117	return None
				118
				119	# Get OFM block configuration
				120	ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
				121	if ofm_config is None:
				122	return None
				123
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	124	acc_banks = ofm_config.banks[self.use_accumulator_element]
				125
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	126	# Update bank counts for IFM and Accumulator
Andreas Nevalainen	6e82708	2020-10-14 13:55:43 +0200	[diff] [blame]	127	self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
Tim Hall	d5044a4	2020-10-06 12:07:04 +0100	[diff] [blame]	128	self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	129
				130	# Validating calculates bank layout and returns validity
				131	if not self.is_valid():
				132	return None
				133
				134	return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
				135
				136	def generate_used_mask(self, active_set):
				137	res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
				138	for kind in active_set:
				139	start = int(self.bank_locations[kind])
				140	end = start + int(self.banks_required[kind])
				141	res[start:end] = 1
				142	return res
				143
				144	def is_compatible(first, second):
				145	"""See if the bank allocations of two convolutions are compatible,
				146	so that they can run back-to-back without a fence in between"""
				147
				148	first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
				149	second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))
				150
				151	first_mask = first.generate_used_mask(first_set)
				152	second_mask = second.generate_used_mask(second_set)
				153
				154	if np.sum(first_mask & second_mask):
				155	# overlap
				156	return False
				157
				158	return True
				159
Louis Verhaard	814cfbb	2020-08-21 14:06:25 +0200	[diff] [blame]	160	def get_shram_memory_access_range(self):
				161	# Returns the SHRAM memory access range used by this shared buffer,
				162	# excluding access to LUT
				163	return MemoryRangeSet(
				164	MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
				165	)
				166
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	167
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	168	def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
Diqing Zhong	09387e2	2020-09-28 18:46:22 +0200	[diff] [blame]	169	tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
				170	scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	171	return len(tensors) == len(scales) and None not in scales
Diqing Zhong	09387e2	2020-09-28 18:46:22 +0200	[diff] [blame]	172
				173
def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Returns True for a non-pooling block type whose feature maps all have
    quantization with a scale, otherwise False."""
    if npu_block_type == NpuBlockType.Pooling:
        return False
    return _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
				176
				177
				178	def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
				179	ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
				180	all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ifm2_tensor, ofm_tensor)
				181
				182	kernel = Kernel(1, 1)
				183	is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
				184	uses_lut = False
				185	ifm_count = 1
				186
				187	if ps.primary_op:
				188	kernel = ps.primary_op.kernel
				189	uses_lut = ps.primary_op.activation_lut is not None
				190
				191	ifm_resampling_mode = resampling_mode.NONE
				192	ifm_bits = 0
				193	ifm_depth = 0
				194	if ifm_tensor:
				195	ifm_resampling_mode = ifm_tensor.resampling_mode
				196	ifm_bits = ifm_tensor.dtype.size_in_bits()
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	197	ifm_shape = ps.primary_op.ifm_shapes[0]
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	198
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame^]	199	if ifm_tensor.shape != []:
				200	ifm_depth = ifm_shape.depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	201
				202	if is_elementwise:
				203	ifm_count = 2
				204	if ifm_tensor.shape == []: # Scalar in ifm1
				205	assert ifm2_tensor
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame^]	206	ifm_depth = ps.primary_op.ifm_shapes[1].depth
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	207	ifm_count = 1
				208	elif not ifm2_tensor or ifm2_tensor.shape == []: # Scalar in ifm2
				209	ifm_count = 1
				210	return SharedBufferAllocation(
				211	arch,
				212	kernel,
				213	uses_lut,
				214	npu_block_type=ps.npu_block_type,
				215	all_fms_have_quant=all_fms_have_quant,
				216	ifm_resampling_mode=ifm_resampling_mode,
				217	ifm_bits=ifm_bits,
				218	ifm_depth=ifm_depth,
				219	ifm_count=ifm_count,
Patrik Gustavsson	2349d42	2020-12-01 16:02:29 +0100	[diff] [blame]	220	ofm_shape=ps.primary_op.ofm_shapes[0],
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	221	)
				222
				223
				224	def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
				225	alloc = shared_buffer_allocation_for_pass(arch, ps)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	226	assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
				227	if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
				228	return alloc
				229
				230	return None
				231
				232
def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
    """Creates the SharedBufferAllocation for an operation given as an
    ``NpuBlockOperation``.

    The LUT is considered in use when the activation is a table lookup, and a
    second IFM is only counted when it is not supplied as a scalar.
    """
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    # Every feature map needs both a quantization and a scale
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps
    )

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_depth = npu_op.ifm.shape.depth
    has_tensor_ifm2 = npu_op.ifm2 is not None and npu_op.ifm2_scalar is None
    ifm_count = 2 if has_tensor_ifm2 else 1

    # The OFM shape is expanded to four dimensions with a leading 1
    ofm = npu_op.ofm.shape
    ofm_dims = [1, ofm.height, ofm.width, ofm.depth]
    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=Shape4D(ofm_dims),
    )
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	257
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	258
				259	def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]:
				260	"""Returns list of block configs that would fit with the given shared buffer allocation"""
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	261	if arch.override_block_config:
				262	config = alloc.try_block(arch.override_block_config)
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame]	263	if config is None:
Michael McGeagh	7a6f843	2020-12-02 15:29:22 +0000	[diff] [blame]	264	raise AllocationError(f"Block config override '{arch.override_block_config}' cannot be allocated")
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame]	265	return [config]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	266
				267	# Constrain the search space if the OFM is smaller than the max block size
				268	# - Add other block search constraints here if required
patrik.gustavsson	eeb8515	2020-12-21 17:10:40 +0000	[diff] [blame^]	269	max_block_width = alloc.ofm_shape.width
				270	max_block_height = alloc.ofm_shape.height
				271	max_block_depth = alloc.ofm_shape.depth
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	272
				273	# Constrain to valid ranges before search
				274	max_block_width = min(arch.ofm_block_max.width, max_block_width)
				275	max_block_height = min(arch.ofm_block_max.height, max_block_height)
				276	max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
				277
				278	valid_block_configs = []
				279	# Try a range of block shapes against this pass
				280	for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
				281	for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
				282	# Try valid OFM block depths
				283	for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
				284	# OFM block depth has the constraint that if it causes the OFM to be
				285	# split, it must be a multiple of the OFM split size
				286	if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
				287	config = alloc.try_block(Block(w, h, c))
				288	if config:
				289	valid_block_configs.append(config)
				290
				291	assert len(valid_block_configs) > 0
				292	return valid_block_configs
Louis Verhaard	e8a5a78	2020-11-02 18:04:27 +0100	[diff] [blame]	293
				294
				295	def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
				296	alloc = shared_buffer_allocation_for_pass(arch, ps)
				297	return find_suitable_block_configs(arch, alloc)