Jacob Bohlin | 2a58530 | 2021-02-11 16:04:53 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
Tim Hall | c8a7386 | 2020-10-27 12:43:14 +0000 | [diff] [blame] | 17 | # Shared buffer allocation works out how to allocate the Ethos-U shared buffer for a given pass. |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 18 | from typing import List |
| 19 | from typing import Tuple |
| 20 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 21 | import numpy as np |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 22 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 23 | from .api import NpuActivationOp |
| 24 | from .api import NpuBlockOperation |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 25 | from .architecture_features import ArchitectureFeatures |
| 26 | from .architecture_features import Block |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 27 | from .architecture_features import SharedBufferArea |
| 28 | from .architecture_features import SHRAMElements |
Dwight Lidman | 7ad408b | 2020-08-11 11:55:22 +0200 | [diff] [blame] | 29 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
Tim Hall | 4ed38bc | 2020-10-20 18:54:20 +0100 | [diff] [blame] | 30 | from .operation import Kernel |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 31 | from .operation import NpuBlockType |
Louis Verhaard | 814cfbb | 2020-08-21 14:06:25 +0200 | [diff] [blame] | 32 | from .range_set import MemoryRangeSet |
Louis Verhaard | 1e17018 | 2020-11-26 11:42:04 +0100 | [diff] [blame] | 33 | from .register_command_stream_util import to_kernel |
patrik.gustavsson | eeb8515 | 2020-12-21 17:10:40 +0000 | [diff] [blame] | 34 | from .shape4d import Shape4D |
Louis Verhaard | 814cfbb | 2020-08-21 14:06:25 +0200 | [diff] [blame] | 35 | from .tensor import MemArea |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 36 | |
| 37 | |
| 38 | class SharedBufferAllocation: |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 39 | def __init__( |
| 40 | self, |
| 41 | arch, |
| 42 | kernel, |
| 43 | uses_lut, |
| 44 | npu_block_type, |
| 45 | all_fms_have_quant, |
| 46 | ifm_resampling_mode, |
| 47 | ifm_bits, |
| 48 | ifm_depth, |
| 49 | ifm_count, |
| 50 | ofm_shape, |
| 51 | ): |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 52 | self.arch = arch |
| 53 | |
| 54 | self.bank_locations = np.zeros(SharedBufferArea.Size) |
| 55 | self.banks_required = np.zeros(SharedBufferArea.Size) |
| 56 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 57 | self.kernel = Kernel(1, 1) if kernel is None else kernel |
| 58 | self.is_elementwise = npu_block_type == NpuBlockType.ElementWise |
| 59 | self.uses_lut = uses_lut |
| 60 | self.ifm_count = ifm_count |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 61 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 62 | self.is_equal_depth_op = self.is_elementwise or npu_block_type in ( |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 63 | NpuBlockType.ConvolutionDepthWise, |
| 64 | NpuBlockType.Pooling, |
| 65 | ) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 66 | |
| 67 | self.use_accumulator_element = SHRAMElements.Acc32 |
Tim Hall | d5044a4 | 2020-10-06 12:07:04 +0100 | [diff] [blame] | 68 | if self.is_elementwise: |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 69 | self.use_ifm_element = SHRAMElements.IFM8_Elementwise |
| 70 | else: |
| 71 | self.use_ifm_element = SHRAMElements.IFM8 |
| 72 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 73 | self.ifm_resampling_mode = ifm_resampling_mode |
| 74 | self.ifm_bits = ifm_bits |
| 75 | self.ifm_depth = ifm_depth |
| 76 | self.ifm_count = ifm_count |
Andreas Nevalainen | 6e82708 | 2020-10-14 13:55:43 +0200 | [diff] [blame] | 77 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 78 | if self.ifm_bits == 16: |
| 79 | if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant: |
| 80 | self.use_accumulator_element = SHRAMElements.Acc40 |
| 81 | self.use_ifm_element = self.use_ifm_element + 1 |
| 82 | assert (self.use_ifm_element == SHRAMElements.IFM16) or ( |
| 83 | self.use_ifm_element == SHRAMElements.IFM16_Elementwise |
| 84 | ) |
| 85 | elif self.ifm_bits == 32: |
| 86 | assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation" |
| 87 | self.use_ifm_element = SHRAMElements.IFM32 |
| 88 | else: |
| 89 | assert self.ifm_bits == 8, "Unexpected IFM bitdepth" |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 90 | |
| 91 | self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits) |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 92 | self.ofm_shape = ofm_shape |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 93 | |
| 94 | self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks |
| 95 | self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks |
| 96 | |
| 97 | def is_valid(self): |
| 98 | # Assign zero-based bank starts (first element remains zero) |
| 99 | self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1] |
| 100 | |
| 101 | # Accumulator area is measured from the end of the buffer |
| 102 | self.bank_locations[SharedBufferArea.Accumulators] = ( |
Louis Verhaard | 814cfbb | 2020-08-21 14:06:25 +0200 | [diff] [blame] | 103 | self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 104 | ) |
| 105 | ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM] |
| 106 | return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators] |
| 107 | |
| 108 | def try_block(self, ofm_block: Block): |
| 109 | # Get IFM block configuration |
| 110 | ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth |
Tim Hall | c30f495 | 2020-06-15 20:47:35 +0100 | [diff] [blame] | 111 | ifm_block = self.arch.get_ifm_block_size( |
| 112 | ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode |
| 113 | ) |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 114 | ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth) |
| 115 | if ifm_config is None: |
| 116 | return None |
| 117 | |
| 118 | # Get OFM block configuration |
| 119 | ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth) |
| 120 | if ofm_config is None: |
| 121 | return None |
| 122 | |
Tim Hall | d5044a4 | 2020-10-06 12:07:04 +0100 | [diff] [blame] | 123 | acc_banks = ofm_config.banks[self.use_accumulator_element] |
| 124 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 125 | # Update bank counts for IFM and Accumulator |
Andreas Nevalainen | 6e82708 | 2020-10-14 13:55:43 +0200 | [diff] [blame] | 126 | self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count |
Tim Hall | d5044a4 | 2020-10-06 12:07:04 +0100 | [diff] [blame] | 127 | self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 128 | |
| 129 | # Validating calculates bank layout and returns validity |
| 130 | if not self.is_valid(): |
| 131 | return None |
| 132 | |
| 133 | return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth) |
| 134 | |
| 135 | def generate_used_mask(self, active_set): |
| 136 | res = np.zeros(self.arch.shram_total_banks, dtype=np.int64) |
| 137 | for kind in active_set: |
| 138 | start = int(self.bank_locations[kind]) |
| 139 | end = start + int(self.banks_required[kind]) |
| 140 | res[start:end] = 1 |
| 141 | return res |
| 142 | |
| 143 | def is_compatible(first, second): |
| 144 | """See if the bank allocations of two convolutions are compatible, |
| 145 | so that they can run back-to-back without a fence in between""" |
| 146 | |
| 147 | first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators)) |
| 148 | second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights)) |
| 149 | |
| 150 | first_mask = first.generate_used_mask(first_set) |
| 151 | second_mask = second.generate_used_mask(second_set) |
| 152 | |
| 153 | if np.sum(first_mask & second_mask): |
| 154 | # overlap |
| 155 | return False |
| 156 | |
| 157 | return True |
| 158 | |
Louis Verhaard | 814cfbb | 2020-08-21 14:06:25 +0200 | [diff] [blame] | 159 | def get_shram_memory_access_range(self): |
| 160 | # Returns the SHRAM memory access range used by this shared buffer, |
| 161 | # excluding access to LUT |
| 162 | return MemoryRangeSet( |
| 163 | MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size |
| 164 | ) |
| 165 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 166 | |
def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
    """Return True when every supplied feature map has a quantization scale.

    Tensors passed as None are ignored. A tensor counts as quantized when
    its ``quantization`` attribute is not None and that object's
    ``scale_f32`` is not None. With no tensors at all the result is True.

    The scale is checked by identity (``is not None``), the same test that
    shared_buffer_allocation_for_npu_op uses. A membership test such as
    ``None not in scales`` compares each scale with ``==``, which raises
    ValueError for an array-valued scale holding more than one element.
    """
    return all(
        tens.quantization is not None and tens.quantization.scale_f32 is not None
        for tens in (ifm_tensor, ifm2_tensor, ofm_tensor)
        if tens is not None
    )
Diqing Zhong | 09387e2 | 2020-09-28 18:46:22 +0200 | [diff] [blame] | 171 | |
| 172 | |
def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Return True when the operation uses 40-bit accumulators.

    That is the case for a 16-bit IFM on any block type other than Pooling,
    provided all supplied feature maps have a quantization scale.
    """
    if ifm_tensor.dtype.size_in_bits() != 16:
        return False
    if npu_block_type == NpuBlockType.Pooling:
        return False
    return _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 179 | |
| 180 | |
def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
    """Build the SharedBufferAllocation describing the primary op of a pass.

    Kernel, LUT usage and IFM properties are read from the pass and its
    primary op; when there is no IFM tensor the IFM bit width and depth are
    passed as 0 and resampling as NONE.
    """
    ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
    # Argument order is irrelevant to the helper: it only checks every non-None tensor
    all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ifm2_tensor, ofm_tensor)

    is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise

    # Defaults used when the pass has no primary op
    kernel = Kernel(1, 1)
    uses_lut = False
    if ps.primary_op:
        kernel = ps.primary_op.kernel
        uses_lut = ps.primary_op.activation_lut is not None

    # Defaults used when there is no IFM tensor
    ifm_resampling_mode = resampling_mode.NONE
    ifm_bits = 0
    ifm_depth = 0
    ifm_count = 1

    if ifm_tensor:
        ifm_resampling_mode = ifm_tensor.resampling_mode
        ifm_bits = ifm_tensor.dtype.size_in_bits()
        first_ifm_shape = ps.primary_op.ifm_shapes[0]

        # An empty shape list marks a scalar tensor
        ifm_is_scalar = ifm_tensor.shape == []
        if not ifm_is_scalar:
            ifm_depth = first_ifm_shape.depth

        if is_elementwise:
            if ifm_is_scalar:
                # Scalar in ifm1: take the depth from the second IFM instead
                assert ifm2_tensor
                ifm_depth = ps.primary_op.ifm_shapes[1].depth
                ifm_count = 1
            elif not ifm2_tensor or ifm2_tensor.shape == []:
                # Missing or scalar ifm2
                ifm_count = 1
            else:
                ifm_count = 2

    return SharedBufferAllocation(
        arch,
        kernel,
        uses_lut,
        npu_block_type=ps.npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=ps.primary_op.ofm_shapes[0],
    )
| 225 | |
| 226 | |
def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
    """Return the shared buffer allocation of a pass for a given block config.

    ``block_config`` is indexed as (height, width, IFM block depth, OFM
    block depth). Returns None when the block does not fit.
    """
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op

    # Block takes (width, height, depth)
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    return alloc if alloc.try_block(ofm_block) else None
| 234 | |
| 235 | |
def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
    """Build the SharedBufferAllocation for an operation given as an
    NpuBlockOperation from the external API."""
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP

    # Feature maps that take part in the operation; ifm2 is optional
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps
    )

    # A second IFM only counts when it is not given as a scalar
    has_non_scalar_ifm2 = npu_op.ifm2 is not None and npu_op.ifm2_scalar is None

    ofm = npu_op.ofm.shape
    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=npu_op.ifm.data_type.size_in_bits(),
        ifm_depth=npu_op.ifm.shape.depth,
        ifm_count=2 if has_non_scalar_ifm2 else 1,
        ofm_shape=Shape4D([1, ofm.height, ofm.width, ofm.depth]),
    )
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 260 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 261 | |
def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]:
    """Returns list of block configs that would fit with the given shared buffer allocation"""

    # Search no further than the OFM itself or the architecture's maximum
    # OFM block, whichever is smaller
    # - Add other block search constraints here if required
    width_limit = min(arch.ofm_block_max.width, alloc.ofm_shape.width)
    height_limit = min(arch.ofm_block_max.height, alloc.ofm_shape.height)
    depth_limit = min(arch.ofm_block_max.depth, alloc.ofm_shape.depth)

    # Width/height advance in steps of the OFM micro-block, and in steps of
    # at least 2 when the IFM is resampled
    smallest_dim = 2 if alloc.ifm_resampling_mode != resampling_mode.NONE else 1
    height_step = max(arch.ofm_ublock.height, smallest_dim)
    width_step = max(arch.ofm_ublock.width, smallest_dim)

    fitting_configs = []
    for width in range(width_step, width_limit + width_step, width_step):
        for height in range(height_step, height_limit + height_step, height_step):
            depth_step = arch.ofm_ublock.depth
            for depth in range(depth_step, depth_limit + depth_step, depth_step):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                splits_ofm = depth < depth_limit
                if splits_ofm and depth % ArchitectureFeatures.OFMSplitDepth != 0:
                    continue
                candidate = alloc.try_block(Block(width, height, depth))
                if candidate:
                    fitting_configs.append(candidate)

    assert len(fitting_configs) > 0
    return fitting_configs
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame] | 294 | |
| 295 | |
def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
    """Return the block configs that fit the shared buffer allocation
    derived from the given pass."""
    return find_suitable_block_configs(arch, shared_buffer_allocation_for_pass(arch, ps))