blob: 51fb168378a039c4ba975879594c3c2c1002934d [file] [log] [blame]
# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.
Tim Hall79d07d22020-04-27 18:20:16 +010018import numpy as np
Diego Russoea6111a2020-04-14 18:41:58 +010019
Diego Russoe8a10452020-04-21 17:39:10 +010020from .architecture_features import ArchitectureFeatures
21from .architecture_features import Block
Diego Russoe8a10452020-04-21 17:39:10 +010022from .architecture_features import SharedBufferArea
23from .architecture_features import SHRAMElements
Tim Hall2a7ebe32020-06-18 11:42:21 +010024from .errors import VelaError
Dwight Lidman7ad408b2020-08-11 11:55:22 +020025from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Tim Hall4ed38bc2020-10-20 18:54:20 +010026from .operation import Kernel
Diego Russoea6111a2020-04-14 18:41:58 +010027from .operation import NpuBlockType
Louis Verhaard814cfbb2020-08-21 14:06:25 +020028from .range_set import MemoryRangeSet
29from .tensor import MemArea
Tim Hall79d07d22020-04-27 18:20:16 +010030
31
class SharedBufferAllocation:
    """Bank layout of the shared buffer (SHRAM) for one pass.

    The constructor gathers the pass properties that drive SHRAM usage (kernel,
    IFM bit depth and depth, SHRAM element kinds, number of IFM buffers) and
    records the reserved weight/OFM banks.  ``try_block`` then fills in the IFM
    and accumulator bank counts for a candidate OFM block and reports whether
    the resulting layout fits.
    """

    def __init__(self, arch, ps):
        """Derive the allocation parameters for pass ``ps`` on ``arch``.

        arch: architecture object; supplies the reserved bank counts and
            ``calc_ifm_block_depth``.
        ps: pass whose primary op and IFM/IFM2/OFM tensors are inspected.
        """
        self.arch = arch

        # Start bank and bank count of every area, indexed by SharedBufferArea
        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        ifm_tensor, ifm2_tensor, _weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
        block_type = ps.npu_block_type

        # Defaults, used as-is when the pass has no primary op
        self.kernel = Kernel(1, 1)
        self.uses_lut = False
        self.ifm_count = 1
        self.is_elementwise = block_type == NpuBlockType.ElementWise

        primary_op = ps.primary_op
        if primary_op:
            self.kernel = primary_op.kernel
            self.uses_lut = primary_op.activation_lut is not None

        # Elementwise, depthwise and pooling passes take the IFM block depth
        # from the OFM block (see try_block)
        equal_depth_types = (NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling)
        self.is_equal_depth_op = self.is_elementwise or block_type in equal_depth_types

        # Start from the 8-bit element kinds; widened below for 16/32-bit IFMs
        self.use_accumulator_element = SHRAMElements.Acc32
        self.use_ifm_element = SHRAMElements.IFM8_Elementwise if self.is_elementwise else SHRAMElements.IFM8

        self.ifm_resampling_mode = resampling_mode.NONE
        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_resampling_mode = ifm_tensor.resampling_mode
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()

            ifm_is_scalar = ifm_tensor.shape == []
            if ifm_tensor.shape != []:
                self.ifm_depth = ifm_tensor.shape[-1]

            if self.is_elementwise:
                if ifm_is_scalar:
                    # Scalar in ifm1: the depth comes from ifm2, single IFM buffer
                    assert ifm2_tensor
                    self.ifm_depth = ifm2_tensor.shape[-1]
                elif ifm2_tensor and ifm2_tensor.shape != []:
                    # Two non-scalar inputs; a missing or scalar ifm2 keeps the count at 1
                    self.ifm_count = 2

            if self.ifm_bits == 16:
                if is_acc_40bits_used(block_type, ifm_tensor, ofm_tensor, ifm2_tensor):
                    self.use_accumulator_element = SHRAMElements.Acc40
                # Step from the 8-bit element kind to its 16-bit neighbour
                self.use_ifm_element += 1
                assert self.use_ifm_element in (SHRAMElements.IFM16, SHRAMElements.IFM16_Elementwise)
            elif self.ifm_bits == 32:
                assert (
                    self.is_elementwise or block_type == NpuBlockType.ReduceSum
                ), "Unsupported 32-bit IFM operation"
                self.use_ifm_element = SHRAMElements.IFM32
            else:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        # Weights and OFM use the architecture's reserved bank counts
        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Recompute ``bank_locations`` and return whether the layout fits.

        The layout fits when the IFM area ends at or before the first
        accumulator bank.
        """
        required = self.banks_required
        locations = self.bank_locations

        # Zero-based start of each area is the running total of the areas
        # before it (the first element remains zero)
        locations[1:] = np.cumsum(required)[:-1]

        # The accumulator area is measured from the end of the buffer
        usable_banks = self.arch.available_shram_banks(self.uses_lut)
        acc_start = usable_banks - required[SharedBufferArea.Accumulators]
        locations[SharedBufferArea.Accumulators] = acc_start

        ifm_end = locations[SharedBufferArea.IFM] + required[SharedBufferArea.IFM]
        return ifm_end <= locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
        """Try to allocate the shared buffer for the given OFM block.

        Updates the IFM and accumulator entries of ``banks_required`` (and, via
        ``is_valid``, ``bank_locations``) as a side effect.

        Returns ``(ofm height, ofm width, ifm block depth, ofm depth)`` when the
        block can be allocated, otherwise ``None``.
        """
        arch = self.arch

        # IFM block configuration
        if self.is_equal_depth_op:
            wanted_ifm_depth = ofm_block.depth
        else:
            wanted_ifm_depth = self.ifm_block_depth
        ifm_block = arch.get_ifm_block_size(
            wanted_ifm_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # OFM block configuration
        ofm_config = arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        acc_banks = ofm_config.banks[self.use_accumulator_element]
        ifm_banks = ifm_config.banks[self.use_ifm_element] * self.ifm_count
        if self.is_elementwise:
            # Elementwise passes are given no accumulator banks
            acc_banks = 0

        self.banks_required[SharedBufferArea.IFM] = ifm_banks
        self.banks_required[SharedBufferArea.Accumulators] = acc_banks

        # Validation computes the bank layout and reports whether it fits
        if self.is_valid():
            return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
        return None

    def generate_used_mask(self, active_set):
        """Return an int64 array with one entry per SHRAM bank, set to 1 for
        every bank covered by an area listed in ``active_set``."""
        mask = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for area in active_set:
            first_bank = int(self.bank_locations[area])
            bank_count = int(self.banks_required[area])
            mask[first_bank : first_bank + bank_count] = 1
        return mask

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between.

        They are compatible when the OFM/accumulator banks of ``first`` do not
        overlap the IFM/weight banks of ``second``."""
        written_by_first = {SharedBufferArea.OFM, SharedBufferArea.Accumulators}
        read_by_second = {SharedBufferArea.IFM, SharedBufferArea.Weights}

        first_mask = first.generate_used_mask(written_by_first)
        second_mask = second.generate_used_mask(read_by_second)

        # A non-zero sum means at least one bank is used by both
        return not np.sum(first_mask & second_mask)

    def get_shram_memory_access_range(self):
        """Return the SHRAM memory access range used by this shared buffer,
        excluding access to the LUT."""
        usable_banks = self.arch.available_shram_banks(self.uses_lut)
        end_address = usable_banks * self.arch.shram_bank_size
        return MemoryRangeSet(MemArea.Shram, 0, end_address)
169
Tim Hall79d07d22020-04-27 18:20:16 +0100170
def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Return True when the pass is not Pooling and every supplied tensor
    carries a quantization with a ``scale_f32`` that is not None.

    npu_block_type: block type of the pass, compared against NpuBlockType.Pooling.
    ifm_tensor, ofm_tensor, ifm2_tensor: tensors to inspect; ``None`` entries
        are ignored.
    """
    tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
    # Identity test rather than `None not in scales`: membership falls back to
    # `==`, which is ambiguous (raises ValueError) if a scale is a numpy array
    # with more than one element.
    has_scale = all(t.quantization is not None and t.quantization.scale_f32 is not None for t in tensors)
    return has_scale and npu_block_type != NpuBlockType.Pooling
176
177
def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    """Build the shared buffer allocation of pass ``ps`` for a fixed block config.

    block_config: indexable as (ofm height, ofm width, ifm block depth,
        ofm depth), the same order that ``SharedBufferAllocation.try_block``
        returns.

    Returns the allocation when the block fits, otherwise ``None``.
    """
    alloc = SharedBufferAllocation(arch, ps)
    # Unless the IFM depth follows the OFM depth, the config's IFM block depth
    # must match the one computed for this pass
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op

    # Block takes (width, height, depth); the config stores height first
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    return alloc if alloc.try_block(ofm_block) else None
185
186
def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
    """Return every block config that can be allocated in SHRAM for pass ``ps``.

    When ``arch.override_block_config`` is set, only that block is tried and a
    VelaError is raised if it cannot be allocated.  Otherwise OFM blocks are
    enumerated in micro-block steps up to the OFM size (capped at
    ``arch.ofm_block_max``) and every block accepted by
    ``SharedBufferAllocation.try_block`` is collected.
    """
    alloc = SharedBufferAllocation(arch, ps)

    if arch.override_block_config:
        config = alloc.try_block(arch.override_block_config)
        if config is None:
            raise VelaError("Block config override '{0}' cannot be allocated".format(arch.override_block_config))
        return [config]

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    ofm_shape = alloc.ofm_tensor.shape
    if len(ofm_shape) <= 2:
        max_block_height = max_block_width = ofm_shape[0]
    else:
        max_block_width = ofm_shape[-2]
        max_block_height = ofm_shape[-3]

    # Common block depth
    max_block_depth = ofm_shape[-1]

    # Constrain to valid ranges before search
    block_limit = arch.ofm_block_max
    max_block_width = min(block_limit.width, max_block_width)
    max_block_height = min(block_limit.height, max_block_height)
    max_block_depth = min(block_limit.depth, max_block_depth)

    ublock = arch.ofm_ublock
    split_depth = ArchitectureFeatures.OFMSplitDepth

    valid_block_configs = []
    # Try a range of block shapes against this pass
    for w in range(ublock.width, max_block_width + ublock.width, ublock.width):
        for h in range(ublock.height, max_block_height + ublock.height, ublock.height):
            # Try valid OFM block depths
            for c in range(ublock.depth, max_block_depth + ublock.depth, ublock.depth):
                # A depth that causes the OFM to be split must be a multiple
                # of the OFM split size
                if c < max_block_depth and c % split_depth != 0:
                    continue
                config = alloc.try_block(Block(w, h, c))
                if config:
                    valid_block_configs.append(config)

    assert valid_block_configs
    return valid_block_configs