# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Shared buffer allocation works out how to allocate the Ethos-U shared buffer for a given pass.
from typing import List
from typing import Tuple

import numpy as np

from .api import NpuActivationOp
from .api import NpuBlockOperation
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .operation import Kernel
from .operation import NpuBlockType
from .range_set import MemoryRangeSet
from .register_command_stream_util import to_kernel
from .shape4d import Shape4D
from .tensor import MemArea


class SharedBufferAllocation:
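    """Works out how the Ethos-U SHRAM shared buffer is divided between the
    Weights, IFM, Accumulator and OFM areas for a single pass"""
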
    def __init__(
        self,
        arch,
        kernel,
        uses_lut,
        npu_block_type,
        all_fms_have_quant,
        ifm_resampling_mode,
        ifm_bits,
        ifm_depth,
        ifm_count,
        ofm_shape,
    ):
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        self.kernel = Kernel(1, 1) if kernel is None else kernel
        self.is_elementwise = npu_block_type == NpuBlockType.ElementWise
        self.uses_lut = uses_lut
        self.ifm_count = ifm_count

        self.is_equal_depth_op = self.is_elementwise or npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )

        self.use_accumulator_element = SHRAMElements.Acc32
        if self.is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_resampling_mode = ifm_resampling_mode
        self.ifm_bits = ifm_bits
        self.ifm_depth = ifm_depth
        self.ifm_count = ifm_count

        if self.ifm_bits == 16:
            if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant:
                self.use_accumulator_element = SHRAMElements.Acc40
            self.use_ifm_element = self.use_ifm_element + 1
            assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                self.use_ifm_element == SHRAMElements.IFM16_Elementwise
            )
        elif self.ifm_bits == 32:
            assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
            self.use_ifm_element = SHRAMElements.IFM32
        else:
            assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_shape = ofm_shape

        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
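        """Computes the bank start locations from the current bank requirements and
        checks that the IFM area does not overlap the accumulator area, which is
        placed at the end of the available SHRAM banks"""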
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
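        """Tries to fit the given OFM block (and the IFM block it implies) into SHRAM;
        returns the block config tuple (ofm height, ofm width, ifm depth, ofm depth)
        on success, or None if the block does not fit"""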
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        acc_banks = ofm_config.banks[self.use_accumulator_element]

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
        self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
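        """Returns a per-bank mask with ones marking the banks used by the areas in active_set"""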
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""

        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        if np.sum(first_mask & second_mask):
            # overlap
            return False

        return True

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(
            MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
        )


def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
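    """Returns True if all the given feature map tensors have a quantization with a valid scale"""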
    tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
    scales = [t.quantization.scale_f32 for t in tensors if t.quantization is not None]
    return len(tensors) == len(scales) and None not in scales


def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
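    """Returns True if the 40-bit accumulator is used: 16-bit IFM, not a pooling operation
    and all feature maps have valid quantization scales"""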
    return (
        ifm_tensor.dtype.size_in_bits() == 16
        and npu_block_type != NpuBlockType.Pooling
        and _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
    )


def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
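    """Creates a SharedBufferAllocation for the given pass, deriving the allocation
    parameters from the pass' primary op and its feature map tensors"""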
    ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
    all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ifm2_tensor, ofm_tensor)

    kernel = Kernel(1, 1)
    is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
    uses_lut = False
    ifm_count = 1

    if ps.primary_op:
        kernel = ps.primary_op.kernel
        uses_lut = ps.primary_op.activation_lut is not None

    ifm_resampling_mode = resampling_mode.NONE
    ifm_bits = 0
    ifm_depth = 0
    if ifm_tensor:
        ifm_resampling_mode = ifm_tensor.resampling_mode
        ifm_bits = ifm_tensor.dtype.size_in_bits()
        ifm_shape = ps.primary_op.ifm_shapes[0]

        if ifm_tensor.shape != []:
            ifm_depth = ifm_shape.depth

        if is_elementwise:
            ifm_count = 2
            if ifm_tensor.shape == []:  # Scalar in ifm1
                assert ifm2_tensor
                ifm_depth = ps.primary_op.ifm_shapes[1].depth
                ifm_count = 1
            elif not ifm2_tensor or ifm2_tensor.shape == []:  # Scalar in ifm2
                ifm_count = 1
    return SharedBufferAllocation(
        arch,
        kernel,
        uses_lut,
        npu_block_type=ps.npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=ps.primary_op.ofm_shapes[0],
    )


def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
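    """Returns the shared buffer allocation for the pass if the given block config
    (ofm height, ofm width, ifm depth, ofm depth) fits in SHRAM, otherwise None"""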
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
    if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
        return alloc

    return None


def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
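    """Creates a SharedBufferAllocation directly from the given NpuBlockOperation"""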
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_depth = npu_op.ifm.shape.depth
    ifm_count = 2 if npu_op.ifm2 is not None and npu_op.ifm2_scalar is None else 1
    ofm_shape = [1, npu_op.ofm.shape.height, npu_op.ofm.shape.width, npu_op.ofm.shape.depth]
    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=Shape4D(ofm_shape),
    )


def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]:
    """Returns list of block configs that would fit with the given shared buffer allocation"""

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    max_block_width = alloc.ofm_shape.width
    max_block_height = alloc.ofm_shape.height
    max_block_depth = alloc.ofm_shape.depth

    # Constrain to valid ranges before search
    max_block_width = min(arch.ofm_block_max.width, max_block_width)
    max_block_height = min(arch.ofm_block_max.height, max_block_height)
    max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)

    min_block_height = max(arch.ofm_ublock.height, 2 if alloc.ifm_resampling_mode != resampling_mode.NONE else 1)
    min_block_width = max(arch.ofm_ublock.width, 2 if alloc.ifm_resampling_mode != resampling_mode.NONE else 1)

    valid_block_configs = []
    # Try a range of block shapes against this pass
    for w in range(min_block_width, max_block_width + min_block_width, min_block_width):
        for h in range(min_block_height, max_block_height + min_block_height, min_block_height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0):
                    config = alloc.try_block(Block(w, h, c))
                    if config:
                        valid_block_configs.append(config)

    assert len(valid_block_configs) > 0
    return valid_block_configs


def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
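    """Returns list of block configs that fit the shared buffer allocation created for the given pass"""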
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    return find_suitable_block_configs(arch, alloc)