# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


# Description:
# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.

import numpy as np

from .operation import NpuBlockType
from .architecture_features import Block, Kernel, SHRAMElements, SharedBufferArea, ArchitectureFeatures


class SharedBufferAllocation:
    def __init__(self, arch, ps):
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

        strides = (1, 1, 1, 1)
        dilation = (1, 1, 1, 1)
        self.kernel = Kernel(1, 1)
        is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise

        if ps.primary_op:
            strides = ps.primary_op.attrs.get("strides", strides)
            dilation = ps.primary_op.attrs.get("dilation", dilation)
            k_h = 1
            k_w = 1
            if weight_tensor:
                if ps.primary_op.type != "FullyConnectedAct":
                    k_h = weight_tensor.shape[0]
                    k_w = weight_tensor.shape[1]
            else:
                k_h = ps.primary_op.attrs.get("filter_height", 1)
                k_w = ps.primary_op.attrs.get("filter_width", 1)

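            # The "strides" and "dilation" attrs are NHWC-ordered, so index 1
            # is the height step and index 2 the width step; note that the
            # Kernel constructor takes its width argument first.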
            self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])

        self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )
        self.strides = strides

        self.use_accumulator_element = SHRAMElements.Acc32
        if is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()
            if ifm_tensor.shape == [] and is_elementwise:
                # Elementwise operator with a scalar ifm: take the depth from ifm2
                self.ifm_depth = ifm2_tensor.shape[-1]
            else:
                self.ifm_depth = ifm_tensor.shape[-1]
            if self.ifm_bits == 16:
                self.use_accumulator_element = SHRAMElements.Acc40
                # Step to the 16-bit variant; the assert guards the assumption
                # that it directly follows the 8-bit one in SHRAMElements
                self.use_ifm_element = self.use_ifm_element + 1
                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
                )
            else:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]
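
    # A worked example of the layout check above (illustrative numbers only,
    # assuming SharedBufferArea orders OFM, Weights, IFM, Accumulators): with
    # banks_required = [2, 4, 6, 8] and shram_total_banks = 48, the cumulative
    # sum places the areas at bank_locations = [0, 2, 6, 12]; the accumulator
    # start is then rewritten to 48 - 8 = 40, and the layout is valid because
    # the IFM area ends at bank 6 + 6 = 12, which is <= 40.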

    def try_block(self, ofm_block: Block):
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(ifm_block_depth, ofm_block, self.kernel)
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
        self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]

        # is_valid() recomputes the bank layout and reports whether it fits
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)
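
    # The returned block config is ordered (ofm height, ofm width, ifm depth,
    # ofm depth); the module-level helpers below index block_config with the
    # same convention.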

    def generate_used_mask(self, active_set):
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res
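
    # For example (illustrative numbers): with 8 total banks, an IFM area of
    # 3 banks starting at bank 2 and a Weights area of 2 banks starting at
    # bank 5, generate_used_mask({IFM, Weights}) returns a mask of
    # [0, 0, 1, 1, 1, 1, 1, 0].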

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""
        # Note: "first" takes the place of self, so this reads as
        # first.is_compatible(second)

        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        if np.sum(first_mask & second_mask):
            # The bank areas overlap
            return False

        return True
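
    # Rationale, as implied by the two sets above: when passes run
    # back-to-back, the first may still be writing its OFM and accumulator
    # banks while the second starts reading its IFM and weight banks, so
    # those two regions must not share any bank.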


def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    alloc = SharedBufferAllocation(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op
    if alloc.try_block(Block(block_config[1], block_config[0], block_config[3])):
        return alloc

    return None


def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
    alloc = SharedBufferAllocation(arch, ps)

    if arch.override_block_config:
        config = alloc.try_block(arch.override_block_config)
        assert config, "Block config override cannot be used"
        return [config]

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    if len(alloc.ofm_tensor.shape) == 2:
        max_block_height = max_block_width = alloc.ofm_tensor.shape[0]
    else:
        max_block_width = alloc.ofm_tensor.shape[-2]
        max_block_height = alloc.ofm_tensor.shape[-3]

    # Common block depth
    max_block_depth = alloc.ofm_tensor.shape[-1]

    # Constrain to valid ranges before search
    max_block_width = min(arch.ofm_block_max.width, max_block_width)
    max_block_height = min(arch.ofm_block_max.height, max_block_height)
    max_block_depth = min(arch.ofm_block_max.depth, max_block_depth)
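
    # The ublock-stepped ranges below intentionally run one step past each
    # maximum (e.g. a ublock width of 4 with max_block_width = 10 tries
    # 4, 8 and 12), presumably so that a block covering any ragged remainder
    # is also tried.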
    valid_block_configs = []
    # Try a range of block shapes against this pass
    for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width):
        for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height):
            # Try valid OFM block depths
            for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth):
                # OFM block depth has the constraint that if it causes the OFM to be
                # split, it must be a multiple of the OFM split size
                if c >= max_block_depth or (c % ArchitectureFeatures.OFMSplitDepth) == 0:
                    config = alloc.try_block(Block(w, h, c))
                    if config:
                        valid_block_configs.append(config)

    assert len(valid_block_configs) > 0
    return valid_block_configs
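

# A minimal usage sketch (hypothetical variable names; arch is an
# ArchitectureFeatures instance and ps a pass produced by the scheduler):
#
#     configs = find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps)
#     block_config = configs[0]  # (ofm height, ofm width, ifm depth, ofm depth)
#     alloc = shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config)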