Blame - ethosu/vela/shared_buffer_allocation.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

17

# Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

import numpy as np

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

19

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

20

from .architecture_features import ArchitectureFeatures

21

from .architecture_features import Block

Diego Russo

e8a1045

2020-04-21 17:39:10 +0100

[diff] [blame]

22

from .architecture_features import SharedBufferArea

23

from .architecture_features import SHRAMElements

Tim Hall

2a7ebe3

2020-06-18 11:42:21 +0100

[diff] [blame]

24

from .errors import VelaError

Dwight Lidman

7ad408b

2020-08-11 11:55:22 +0200

[diff] [blame]

25

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Tim Hall

2020-10-20 18:54:20 +0100

[diff] [blame]

26

from .operation import Kernel

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

27

from .operation import NpuBlockType

Louis Verhaard

2020-08-21 14:06:25 +0200

[diff] [blame]

28

from .range_set import MemoryRangeSet

29

from .tensor import MemArea

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

30

31

32

class SharedBufferAllocation:
    """Works out how the Ethos-U55 shared buffer is divided up for one pass.

    ``banks_required`` and ``bank_locations`` are arrays with one entry per
    ``SharedBufferArea`` (``SharedBufferArea.Size`` entries in total) holding,
    respectively, the number of banks each area needs and the bank at which
    each area starts.
    """

    def __init__(self, arch, ps):
        """Collect the properties of pass ``ps`` that drive the bank allocation.

        ``arch`` supplies the reserved weight/output bank counts and the IFM
        block depth calculation; ``ps`` supplies the block type, the primary
        operation and its IFM/IFM2/OFM tensors.
        """
        self.arch = arch

        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        # The weight tensor is not needed for the bank allocation
        ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()

        # Defaults, used when the pass has no primary op
        self.kernel = Kernel(1, 1)
        self.is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
        self.uses_lut = False
        self.ifm_count = 1

        if ps.primary_op:
            self.kernel = ps.primary_op.kernel
            self.uses_lut = ps.primary_op.activation_lut is not None

        self.is_equal_depth_op = self.is_elementwise or ps.npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )

        self.use_accumulator_element = SHRAMElements.Acc32
        if self.is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_resampling_mode = resampling_mode.NONE
        self.ifm_bits = 0
        self.ifm_depth = 0
        if ifm_tensor:
            self.ifm_resampling_mode = ifm_tensor.resampling_mode
            self.ifm_bits = ifm_tensor.dtype.size_in_bits()

            if ifm_tensor.shape != []:
                self.ifm_depth = ifm_tensor.shape[-1]

            if self.is_elementwise:
                # Two IFMs are counted unless one of them is a scalar
                self.ifm_count = 2
                if ifm_tensor.shape == []:  # Scalar in ifm1
                    assert ifm2_tensor
                    self.ifm_depth = ifm2_tensor.shape[-1]
                    self.ifm_count = 1
                elif not ifm2_tensor or ifm2_tensor.shape == []:  # Scalar in ifm2
                    self.ifm_count = 1

            if self.ifm_bits == 16:
                if is_acc_40bits_used(ps.npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor):
                    self.use_accumulator_element = SHRAMElements.Acc40
                # Step from the 8-bit element to the 16-bit one; this relies on the
                # ordering of SHRAMElements, which the assert below verifies
                self.use_ifm_element = self.use_ifm_element + 1
                assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                    self.use_ifm_element == SHRAMElements.IFM16_Elementwise
                )
            elif self.ifm_bits == 32:
                assert (
                    self.is_elementwise or ps.npu_block_type == NpuBlockType.ReduceSum
                ), "Unsupported 32-bit IFM operation"
                self.use_ifm_element = SHRAMElements.IFM32
            else:
                assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_tensor = ofm_tensor

        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Recompute ``bank_locations`` and report whether the layout fits.

        Returns True when the IFM area ends at or before the bank where the
        accumulator area starts.
        """
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: "Block"):
        """Try to allocate the shared buffer for the given OFM block.

        Updates the IFM and accumulator entries of ``banks_required`` (and,
        through ``is_valid``, ``bank_locations``).

        Returns the tuple ``(ofm height, ofm width, ifm depth, ofm depth)`` on
        success, or None when the architecture has no block config for the IFM
        or OFM block or when the resulting layout is not valid.
        """
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        acc_banks = ofm_config.banks[self.use_accumulator_element]

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
        self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
        """Return a per-bank mask of the banks occupied by the areas in ``active_set``.

        The result is an int64 array with ``arch.shram_total_banks`` entries,
        set to 1 for every bank inside one of the given areas and 0 elsewhere.
        """
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            # Locations and counts may be stored as floats; slicing needs ints
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""

        first_set = {SharedBufferArea.OFM, SharedBufferArea.Accumulators}
        second_set = {SharedBufferArea.IFM, SharedBufferArea.Weights}

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        # Any bank set in both masks is an overlap, which makes them incompatible
        return not np.any(first_mask & second_mask)

    def get_shram_memory_access_range(self):
        """Return the SHRAM memory access range used by this shared buffer,
        excluding access to LUT"""
        return MemoryRangeSet(
            MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
        )

169

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

170

Diqing Zhong

09387e2

2020-09-28 18:46:22 +0200

[diff] [blame^]

171

def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Tell whether the 40-bit accumulator element should be selected.

    Returns True when ``npu_block_type`` is not ``NpuBlockType.Pooling`` and
    every tensor that was supplied (tensors passed as None are ignored) has a
    quantization whose ``scale_f32`` is set.
    """
    tensors = [t for t in (ifm_tensor, ifm2_tensor, ofm_tensor) if t is not None]
    # Identity checks rather than "None in scales": a membership test falls back
    # to ==, which is ambiguous (raises) when a scale is a multi-element array
    has_scale = all(t.quantization is not None and t.quantization.scale_f32 is not None for t in tensors)
    return has_scale and npu_block_type != NpuBlockType.Pooling

176

177

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

178

def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config):
    """Build the shared buffer allocation of pass ``ps`` for a given block config.

    ``block_config`` is indexed as a 4-element sequence; presumably it has the
    same (height, width, ifm depth, ofm depth) layout as the tuple returned by
    ``SharedBufferAllocation.try_block`` - confirm against callers.

    Returns the ``SharedBufferAllocation`` when the block can be allocated,
    otherwise None.
    """
    alloc = SharedBufferAllocation(arch, ps)
    assert (alloc.ifm_block_depth == block_config[2]) or alloc.is_equal_depth_op

    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    # try_block returns None on failure and a 4-tuple otherwise
    if alloc.try_block(ofm_block) is None:
        return None

    return alloc

def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps):

188

alloc = SharedBufferAllocation(arch, ps)

189

190

if arch.override_block_config:

191

config = alloc.try_block(arch.override_block_config)

Tim Hall

2a7ebe3

2020-06-18 11:42:21 +0100

[diff] [blame]

192

if config is None:

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

193

raise VelaError("Block config override '{0}' cannot be allocated".format(arch.override_block_config))

Tim Hall

2a7ebe3

2020-06-18 11:42:21 +0100

[diff] [blame]

194

return [config]

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

195

196

# Constrain the search space if the OFM is smaller than the max block size

197

# - Add other block search constraints here if required

Fredrik Svedberg

0f98b36

2020-09-29 10:00:39 +0200

[diff] [blame]

198

if len(alloc.ofm_tensor.shape) <= 2:

Tim Hall