Blame - ethosu/vela/shared_buffer_allocation.py - ml/ethos-u/ethos-u-vela

blob: 07637f36d080d561eb431b0333b5849912887a73 [file] [log] [blame]

Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	1	# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
				2	#
				3	# SPDX-License-Identifier: Apache-2.0
				4	#
				5	# Licensed under the Apache License, Version 2.0 (the License); you may
				6	# not use this file except in compliance with the License.
				7	# You may obtain a copy of the License at
				8	#
				9	# www.apache.org/licenses/LICENSE-2.0
				10	#
				11	# Unless required by applicable law or agreed to in writing, software
				12	# distributed under the License is distributed on an AS IS BASIS, WITHOUT
				13	# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				14	# See the License for the specific language governing permissions and
				15	# limitations under the License.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	16	# Description:
				17	# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	18	import numpy as np
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	19
Diego Russo	e8a1045	2020-04-21 17:39:10 +0100	[diff] [blame]	20	from .architecture_features import ArchitectureFeatures
				21	from .architecture_features import Block
				22	from .architecture_features import Kernel
				23	from .architecture_features import SharedBufferArea
				24	from .architecture_features import SHRAMElements
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame^]	25	from .errors import VelaError
Diego Russo	ea6111a	2020-04-14 18:41:58 +0100	[diff] [blame]	26	from .operation import NpuBlockType
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	27
				28
				29	class SharedBufferAllocation:
				30	def __init__(self, arch, ps):
				31	self.arch = arch
				32
				33	self.bank_locations = np.zeros(SharedBufferArea.Size)
				34	self.banks_required = np.zeros(SharedBufferArea.Size)
				35
				36	ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
				37
				38	strides = (1, 1, 1, 1)
				39	dilation = (1, 1, 1, 1)
				40	self.kernel = Kernel(1, 1)
				41	is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
				42
				43	if ps.primary_op:
				44	strides = ps.primary_op.attrs.get("strides", strides)
				45	dilation = ps.primary_op.attrs.get("dilation", dilation)
				46	k_h = 1
				47	k_w = 1
				48	if weight_tensor:
				49	if ps.primary_op.type != "FullyConnectedAct":
				50	k_h = weight_tensor.shape[0]
				51	k_w = weight_tensor.shape[1]
				52	else:
				53	k_h = ps.primary_op.attrs.get("filter_height", 1)
				54	k_w = ps.primary_op.attrs.get("filter_width", 1)
				55
				56	self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])
				57
				58	self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
				59	NpuBlockType.ConvolutionDepthWise,
				60	NpuBlockType.Pooling,
				61	)
				62	self.strides = strides
				63
				64	self.use_accumulator_element = SHRAMElements.Acc32
				65	if is_elementwise:
				66	self.use_ifm_element = SHRAMElements.IFM8_Elementwise
				67	else:
				68	self.use_ifm_element = SHRAMElements.IFM8
				69
				70	self.ifm_bits = 0
				71	self.ifm_depth = 0
				72	if ifm_tensor:
				73	self.ifm_bits = ifm_tensor.dtype.size_in_bits()
				74	if ifm_tensor.shape == [] and is_elementwise:
				75	# Elementwise operator with scalar in ifm, use ifm2 depth
				76	self.ifm_depth = ifm2_tensor.shape[-1]
				77	else:
				78	self.ifm_depth = ifm_tensor.shape[-1]
				79	if self.ifm_bits == 16:
				80	self.use_accumulator_element = SHRAMElements.Acc40
				81	self.use_ifm_element = self.use_ifm_element + 1
				82	assert (self.use_ifm_element == SHRAMElements.IFM16) or (
				83	self.use_ifm_element == SHRAMElements.IFM16_Elementwise
				84	)
				85	else:
				86	assert self.ifm_bits == 8, "Unexpected IFM bitdepth"
				87
Dwight Lidman	a9390f7	2020-05-13 12:00:08 +0200	[diff] [blame]	88	self.ifm_resampling_mode = ifm_tensor.resampling_mode
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	89	self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
				90	self.ofm_tensor = ofm_tensor
				91
				92	self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
				93	self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks
				94
				95	def is_valid(self):
				96	# Assign zero-based bank starts (first element remains zero)
				97	self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]
				98
				99	# Accumulator area is measured from the end of the buffer
				100	self.bank_locations[SharedBufferArea.Accumulators] = (
				101	self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
				102	)
				103	ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
				104	return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
				105
				106	def try_block(self, ofm_block: Block):
				107	# Get IFM block configuration
				108	ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
Tim Hall	c30f495	2020-06-15 20:47:35 +0100	[diff] [blame]	109	ifm_block = self.arch.get_ifm_block_size(
				110	ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
				111	)
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	112	ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
				113	if ifm_config is None:
				114	return None
				115
				116	# Get OFM block configuration
				117	ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
				118	if ofm_config is None:
				119	return None
				120
				121	# Update bank counts for IFM and Accumulator
				122	self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
				123	self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]
				124
				125	# Validating calculates bank layout and returns validity
				126	if not self.is_valid():
				127	return None
				128
				129	return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
				130
				131	def generate_used_mask(self, active_set):
				132	res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
				133	for kind in active_set:
				134	start = int(self.bank_locations[kind])
				135	end = start + int(self.banks_required[kind])
				136	res[start:end] = 1
				137	return res
				138
				139	def is_compatible(first, second):
				140	"""See if the bank allocations of two convolutions are compatible,
				141	so that they can run back-to-back without a fence in between"""
				142
				143	first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
				144	second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))
				145
				146	first_mask = first.generate_used_mask(first_set)
				147	second_mask = second.generate_used_mask(second_set)
				148
				149	if np.sum(first_mask & second_mask):
				150	# overlap
				151	return False
				152
				153	return True
				154
				155
				156	def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
				157	alloc = SharedBufferAllocation(arch, ps)
				158	assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
				159	if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
				160	return alloc
				161
				162	return None
				163
				164
				165	def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
				166	alloc = SharedBufferAllocation(arch, ps)
				167
				168	if arch.override_block_config:
				169	config = alloc.try_block(arch.override_block_config)
Tim Hall	2a7ebe3	2020-06-18 11:42:21 +0100	[diff] [blame^]	170	if config is None:
				171	raise VelaError("Block config override '{0}' cannot be allocated".format(arch.override_block_config) )
				172	return [config]
Tim Hall	79d07d2	2020-04-27 18:20:16 +0100	[diff] [blame]	173
				174	# Constrain the search space if the OFM is smaller than the max block size
				175	# - Add other block search constraints here if required
				176	if len(alloc.ofm_tensor.shape) == 2:
				177	max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
				178	else:
				179	max_block_width = alloc.ofm_tensor.shape[-2]
				180	max_block_height = alloc.ofm_tensor.shape[-3]
				181
				182	# Common block depth
				183	max_block_depth = alloc.ofm_tensor.shape[-1]
				184
				185	# Constrain to valid ranges before search
				186	max_block_width = min(arch.ofm_block_max.width, max_block_width)
				187	max_block_height = min(arch.ofm_block_max.height, max_block_height)
				188	max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
				189
				190	valid_block_configs = []
				191	# Try a range of block shapes against this pass
				192	for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
				193	for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
				194	# Try valid OFM block depths
				195	for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
				196	# OFM block depth has the constraint that if it causes the OFM to be
				197	# split, it must be a multiple of the OFM split size
				198	if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
				199	config = alloc.try_block(Block(w, h, c))
				200	if config:
				201	valid_block_configs.append(config)
				202
				203	assert len(valid_block_configs) > 0
				204	return valid_block_configs