Blame - ethosu/vela/shared_buffer_allocation.py - ml/ethos-u/ethos-u-vela

Tim Hall

79d07d2

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

79d07d2

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.

Tim Hall

79d07d2

2020-04-27 18:20:16 +0100

[diff] [blame]

18

import numpy as np

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

19

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

20

from .architecture_features import ArchitectureFeatures

21

from .architecture_features import Block

22

from .architecture_features import Kernel

23

from .architecture_features import SharedBufferArea

24

from .architecture_features import SHRAMElements

Tim Hall

2a7ebe3

2020-06-18 11:42:21 +0100

[diff] [blame]

25

from .errors import VelaError

Dwight Lidman

7ad408b

2020-08-11 11:55:22 +0200

[diff] [blame]

26

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

from .operation import NpuBlockType

Tim Hall

79d07d2

2020-04-27 18:20:16 +0100

[diff] [blame]

28

29

30

class SharedBufferAllocation:
    """Bank allocation of the shared buffer for a single pass.

    Two arrays, both indexed by ``SharedBufferArea``, hold the state:

    * ``banks_required`` - number of banks each area needs
    * ``bank_locations`` - first bank of each area (filled in by ``is_valid``)

    The weight and OFM areas are sized once in the constructor from the
    architecture's reserved bank counts. The IFM and accumulator areas are
    re-sized every time ``try_block`` is called with a candidate OFM block.
    """

    def __init__(self, arch, ps):
        """Collect the properties of pass ``ps`` that drive the allocation.

        arch: architecture description; supplies the bank counts, the IFM
              block depth calculation and the block configuration lookups.
        ps:   the pass to allocate for; its primary op, NPU block type and
              IFM/IFM2/weight/OFM tensors are inspected.
        """
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        ifm_tensor, ifm2_tensor, weight_tensor, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

        # Defaults used when the pass has no primary op
        strides = (1, 1, 1, 1)
        dilation = (1, 1, 1, 1)
        self.kernel = Kernel(1, 1)
        is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise

        if ps.primary_op:
            strides = ps.primary_op.attrs.get("strides", strides)
            dilation = ps.primary_op.attrs.get("dilation", dilation)
            k_h = 1
            k_w = 1
            if weight_tensor:
                # FullyConnectedAct keeps the 1x1 kernel, every other op with
                # weights takes the kernel size from the weight tensor shape
                if ps.primary_op.type != "FullyConnectedAct":
                    k_h = weight_tensor.shape[0]
                    k_w = weight_tensor.shape[1]
            else:
                # No weights: the kernel size comes from the op attributes
                k_h = ps.primary_op.attrs.get("filter_height", 1)
                k_w = ps.primary_op.attrs.get("filter_width", 1)

            self.kernel = Kernel(k_w, k_h, strides[2], strides[1], dilation[2], dilation[1])

        # For these block types try_block uses the OFM block depth as the IFM block depth
        self.is_equal_depth_op = is_elementwise or ps.npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )
        self.strides = strides

        self.use_accumulator_element = SHRAMElements.Acc32
        if is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_resampling_mode = resampling_mode.NONE
        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_resampling_mode = ifm_tensor.resampling_mode
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()
            if ifm_tensor.shape == [] and is_elementwise:
                # Elementwise operator with scalar in ifm, use ifm2 depth
                self.ifm_depth = ifm2_tensor.shape[-1]
            else:
                self.ifm_depth = ifm_tensor.shape[-1]
            if self.ifm_bits == 16:
                self.use_accumulator_element = SHRAMElements.Acc40
                # Step from the 8-bit element to its 16-bit counterpart; the
                # assert below checks that the enum ordering allows this
                self.use_ifm_element = self.use_ifm_element + 1
                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
                )
            elif self.ifm_bits == 32 and (is_elementwise or ps.npu_block_type == NpuBlockType.ReduceSum):
                # The bit depth test must apply to both block types. Without
                # the parentheses "and" binds tighter than "or", which selected
                # IFM32 for every elementwise op whatever its IFM bit depth.
                self.use_ifm_element = SHRAMElements.IFM32
            else:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Compute ``bank_locations`` from ``banks_required`` and check the fit.

        Returns a true value when the end of the IFM area does not go past
        the start of the accumulator area.
        """
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.shram_total_banks - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: "Block"):
        """Try to allocate the shared buffer for the given OFM block.

        Updates the IFM and accumulator entries of ``banks_required`` and,
        through ``is_valid``, the whole of ``bank_locations``.

        Returns the tuple ``(ofm_block.height, ofm_block.width,
        ifm_block.depth, ofm_block.depth)`` on success. Returns None when the
        architecture has no block configuration for the IFM or OFM block, or
        when the resulting layout is not valid.
        """
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element]
        self.banks_required[SharedBufferArea.Accumulators] = ofm_config.banks[self.use_accumulator_element]

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
        """Return a per-bank mask of the banks occupied by the given areas.

        active_set: iterable of indices into ``bank_locations`` and
                    ``banks_required``.

        The result is an int64 array with ``arch.shram_total_banks`` entries,
        set to 1 for every bank covered by one of the areas and 0 elsewhere.
        """
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            # The bank arrays hold floats, so convert before slicing
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

def is_compatible(first, second):
    """Check whether two bank allocations can run back-to-back without a fence.

    The OFM and accumulator banks of ``first`` are compared against the IFM
    and weight banks of ``second``. The allocations are compatible (True)
    only when no bank appears in both groups.
    """
    areas_of_first = {SharedBufferArea.OFM, SharedBufferArea.Accumulators}
    areas_of_second = {SharedBufferArea.IFM, SharedBufferArea.Weights}

    banks_of_first = first.generate_used_mask(areas_of_first)
    banks_of_second = second.generate_used_mask(areas_of_second)

    # A non-zero sum means at least one bank is marked in both masks
    shared_bank_count = np.sum(banks_of_first & banks_of_second)
    return not shared_bank_count

def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    """Build the shared buffer allocation of pass ``ps`` for one block config.

    ``block_config`` is indexed as (OFM height, OFM width, IFM depth,
    OFM depth), the same order that ``SharedBufferAllocation.try_block``
    returns. Returns the allocation when the block can be allocated,
    otherwise None.
    """
    allocation = SharedBufferAllocation(arch, ps)

    # Unless the op ties IFM depth to OFM depth, the config has to carry the
    # IFM block depth that was worked out for this pass
    ifm_depth_matches = allocation.ifm_block_depth == block_config[2]
    assert ifm_depth_matches or allocation.is_equal_depth_op

    # Block takes width first, the block config stores height first
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    return allocation if allocation.try_block(ofm_block) else None

def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):
    """List every block config of pass ``ps`` that fits in the shared buffer.

    When ``arch.override_block_config`` is set, only that block is tried and
    a VelaError is raised if it cannot be allocated. Otherwise all OFM block
    shapes, in steps of ``arch.ofm_ublock`` and up to the smaller of the OFM
    size and ``arch.ofm_block_max``, are tried.

    Returns a list of the tuples produced by
    ``SharedBufferAllocation.try_block``; asserts that the search found at
    least one.
    """
    allocation = SharedBufferAllocation(arch, ps)

    forced_block = arch.override_block_config
    if forced_block:
        forced_config = allocation.try_block(forced_block)
        if forced_config is None:
            raise VelaError("Block config override '{0}' cannot be allocated".format(forced_block))
        return [forced_config]

    # Constrain the search space if the OFM is smaller than the max block size
    # - Add other block search constraints here if required
    ofm_shape = allocation.ofm_tensor.shape
    if len(ofm_shape) == 2:
        height_limit = width_limit = ofm_shape[0]
    else:
        width_limit = ofm_shape[-2]
        height_limit = ofm_shape[-3]

    # Common block depth
    depth_limit = ofm_shape[-1]

    # Constrain to valid ranges before search
    largest = arch.ofm_block_max
    width_limit = min(largest.width, width_limit)
    height_limit = min(largest.height, height_limit)
    depth_limit = min(largest.depth, depth_limit)

    step = arch.ofm_ublock

    # OFM block depth has the constraint that if it causes the OFM to be
    # split, it must be a multiple of the OFM split size. The usable depths
    # do not depend on width or height, so they are worked out once.
    split_depth = ArchitectureFeatures.OFMSplitDepth
    usable_depths = [
        depth
        for depth in range(step.depth, depth_limit + step.depth, step.depth)
        if depth >= depth_limit or depth % split_depth == 0
    ]

    # Try a range of block shapes against this pass
    found_configs = []
    for width in range(step.width, width_limit + step.width, step.width):
        for height in range(step.height, height_limit + step.height, step.height):
            for depth in usable_depths:
                candidate = allocation.try_block(Block(width, height, depth))
                if candidate:
                    found_configs.append(candidate)

    assert len(found_configs) > 0
    return found_configs