Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 1 | # Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved. |
| 2 | # |
| 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the License); you may |
| 6 | # not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an AS IS BASIS, WITHOUT |
| 13 | # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 16 | # Description: |
| 17 | # Shared buffer allocation works out how to allocate the Ethos-U55 shared buffer for a given pass. |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 18 | from typing import List |
| 19 | from typing import Tuple |
| 20 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 21 | import numpy as np |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 22 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 23 | from .api import NpuActivationOp |
| 24 | from .api import NpuBlockOperation |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 25 | from .architecture_features import ArchitectureFeatures |
| 26 | from .architecture_features import Block |
Diego Russo | e8a1045 | 2020-04-21 17:39:10 +0100 | [diff] [blame] | 27 | from .architecture_features import SharedBufferArea |
| 28 | from .architecture_features import SHRAMElements |
Tim Hall | 2a7ebe3 | 2020-06-18 11:42:21 +0100 | [diff] [blame] | 29 | from .errors import VelaError |
Dwight Lidman | 7ad408b | 2020-08-11 11:55:22 +0200 | [diff] [blame] | 30 | from .ethos_u55_regs.ethos_u55_regs import resampling_mode |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 31 | from .high_level_command_to_npu_op import to_kernel |
Tim Hall | 4ed38bc | 2020-10-20 18:54:20 +0100 | [diff] [blame] | 32 | from .operation import Kernel |
Diego Russo | ea6111a | 2020-04-14 18:41:58 +0100 | [diff] [blame] | 33 | from .operation import NpuBlockType |
Louis Verhaard | 814cfbb | 2020-08-21 14:06:25 +0200 | [diff] [blame] | 34 | from .range_set import MemoryRangeSet |
| 35 | from .tensor import MemArea |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 36 | |
| 37 | |
class SharedBufferAllocation:
    """Layout of the Ethos-U55 SHRAM shared buffer for a single operation.

    The shared buffer is divided into four areas (Weights, IFM, Accumulators,
    OFM; see SharedBufferArea). The Weights and OFM bank counts are fixed by
    the architecture; the IFM and Accumulator counts depend on the chosen
    block configuration and are computed per-candidate by try_block().
    """

    def __init__(
        self,
        arch,
        kernel,
        uses_lut,
        npu_block_type,
        all_fms_have_quant,
        ifm_resampling_mode,
        ifm_bits,
        ifm_depth,
        ifm_count,
        ofm_shape,
    ):
        self.arch = arch

        # Per-area bank start positions and bank counts, indexed by SharedBufferArea
        self.bank_locations = np.zeros(SharedBufferArea.Size)
        self.banks_required = np.zeros(SharedBufferArea.Size)

        # Fall back to a 1x1 kernel when the op has none
        self.kernel = Kernel(1, 1) if kernel is None else kernel
        self.is_elementwise = npu_block_type == NpuBlockType.ElementWise
        self.uses_lut = uses_lut
        self.ifm_count = ifm_count

        # These block types require the IFM block depth to equal the OFM block depth
        self.is_equal_depth_op = self.is_elementwise or npu_block_type in (
            NpuBlockType.ConvolutionDepthWise,
            NpuBlockType.Pooling,
        )

        # Select SHRAM element types; refined below based on the IFM bit depth
        self.use_accumulator_element = SHRAMElements.Acc32
        if self.is_elementwise:
            self.use_ifm_element = SHRAMElements.IFM8_Elementwise
        else:
            self.use_ifm_element = SHRAMElements.IFM8

        self.ifm_resampling_mode = ifm_resampling_mode
        self.ifm_bits = ifm_bits
        self.ifm_depth = ifm_depth
        # NOTE(review): ifm_count was already assigned above; this repeat is redundant
        self.ifm_count = ifm_count

        if self.ifm_bits == 16:
            # 40-bit accumulators are used for quantized 16-bit non-pooling ops
            if npu_block_type != NpuBlockType.Pooling and all_fms_have_quant:
                self.use_accumulator_element = SHRAMElements.Acc40
            # Relies on each 16-bit element enum directly following its 8-bit counterpart
            self.use_ifm_element = self.use_ifm_element + 1
            assert (self.use_ifm_element == SHRAMElements.IFM16) or (
                self.use_ifm_element == SHRAMElements.IFM16_Elementwise
            )
        elif self.ifm_bits == 32:
            assert self.is_elementwise or npu_block_type == NpuBlockType.ReduceSum, "Unsupported 32-bit IFM operation"
            self.use_ifm_element = SHRAMElements.IFM32
        else:
            assert self.ifm_bits == 8, "Unexpected IFM bitdepth"

        self.ifm_block_depth = arch.calc_ifm_block_depth(self.ifm_depth, self.ifm_bits)
        self.ofm_shape = ofm_shape

        # Weights and OFM areas have fixed, architecture-defined bank counts
        self.banks_required[SharedBufferArea.Weights] = arch.shram_reserved_weight_banks
        self.banks_required[SharedBufferArea.OFM] = arch.shram_reserved_output_banks

    def is_valid(self):
        """Recomputes bank start locations and returns whether all areas fit.

        The Weights/IFM areas grow upwards from the start of the buffer while
        the Accumulator area is placed at the end; the layout is valid when
        the IFM area does not overlap the Accumulator area.
        """
        # Assign zero-based bank starts (first element remains zero)
        self.bank_locations[1:] = np.cumsum(self.banks_required)[:-1]

        # Accumulator area is measured from the end of the buffer
        self.bank_locations[SharedBufferArea.Accumulators] = (
            self.arch.available_shram_banks(self.uses_lut) - self.banks_required[SharedBufferArea.Accumulators]
        )
        ifm_end = self.bank_locations[SharedBufferArea.IFM] + self.banks_required[SharedBufferArea.IFM]
        return ifm_end <= self.bank_locations[SharedBufferArea.Accumulators]

    def try_block(self, ofm_block: Block):
        """Tries to fit the given OFM block in the shared buffer.

        Updates the IFM/Accumulator bank requirements as a side effect.
        Returns the block config tuple (ofm height, ofm width, ifm depth,
        ofm depth) on success, or None if the block does not fit.
        """
        # Get IFM block configuration
        ifm_block_depth = ofm_block.depth if self.is_equal_depth_op else self.ifm_block_depth
        ifm_block = self.arch.get_ifm_block_size(
            ifm_block_depth, ofm_block, self.kernel, ifm_resampling_mode=self.ifm_resampling_mode
        )
        ifm_config = self.arch.get_block_config(ifm_block.width, ifm_block.height, ifm_block.depth)
        if ifm_config is None:
            return None

        # Get OFM block configuration
        ofm_config = self.arch.get_block_config(ofm_block.width, ofm_block.height, ofm_block.depth)
        if ofm_config is None:
            return None

        acc_banks = ofm_config.banks[self.use_accumulator_element]

        # Update bank counts for IFM and Accumulator
        self.banks_required[SharedBufferArea.IFM] = ifm_config.banks[self.use_ifm_element] * self.ifm_count
        # Elementwise ops do not use the accumulator area
        self.banks_required[SharedBufferArea.Accumulators] = 0 if self.is_elementwise else acc_banks

        # Validating calculates bank layout and returns validity
        if not self.is_valid():
            return None

        return (ofm_block.height, ofm_block.width, ifm_block.depth, ofm_block.depth)

    def generate_used_mask(self, active_set):
        """Returns a per-bank 0/1 mask marking the banks used by the given areas."""
        res = np.zeros(self.arch.shram_total_banks, dtype=np.int64)
        for kind in active_set:
            start = int(self.bank_locations[kind])
            end = start + int(self.banks_required[kind])
            res[start:end] = 1
        return res

    def is_compatible(first, second):
        """See if the bank allocations of two convolutions are compatible,
        so that they can run back-to-back without a fence in between"""
        # NOTE: called as first.is_compatible(second); "first" takes the place of self

        first_set = set((SharedBufferArea.OFM, SharedBufferArea.Accumulators))
        second_set = set((SharedBufferArea.IFM, SharedBufferArea.Weights))

        first_mask = first.generate_used_mask(first_set)
        second_mask = second.generate_used_mask(second_set)

        if np.sum(first_mask & second_mask):
            # overlap
            return False

        return True

    def get_shram_memory_access_range(self):
        # Returns the SHRAM memory access range used by this shared buffer,
        # excluding access to LUT
        return MemoryRangeSet(
            MemArea.Shram, 0, self.arch.available_shram_banks(self.uses_lut) * self.arch.shram_bank_size
        )
| 165 | |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 166 | |
def _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=None) -> bool:
    """Returns True if every present feature-map tensor carries quantization
    info with a float scale; missing (None) tensors are ignored."""
    for tensor in (ifm_tensor, ifm2_tensor, ofm_tensor):
        if tensor is None:
            continue
        quant = tensor.quantization
        if quant is None or quant.scale_f32 is None:
            return False
    return True
Diqing Zhong | 09387e2 | 2020-09-28 18:46:22 +0200 | [diff] [blame] | 171 | |
| 172 | |
def is_acc_40bits_used(npu_block_type, ifm_tensor, ofm_tensor, ifm2_tensor=None):
    """Returns True if the 40-bit accumulator is used: any non-Pooling block
    type where all feature maps are quantized."""
    if npu_block_type == NpuBlockType.Pooling:
        return False
    return _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor)
| 175 | |
| 176 | |
def shared_buffer_allocation_for_pass(arch, ps) -> SharedBufferAllocation:
    """Creates a SharedBufferAllocation for the given pass.

    Extracts the kernel, LUT usage and IFM properties from the pass's primary
    op and its feature-map tensors, then constructs the allocation object.
    Elementwise ops use two IFM areas unless one operand is a scalar.
    """
    ifm_tensor, ifm2_tensor, _, ofm_tensor = ps.get_primary_op_ifm_ifm2_weights_ofm()
    # Fix: the previous positional call passed ifm2_tensor in the ofm_tensor
    # position (and vice versa) of _all_fms_have_quant(ifm, ofm, ifm2=None).
    # Harmless today because the helper only collects the non-None tensors,
    # but the call now matches the helper's declared parameter order.
    all_fms_have_quant = _all_fms_have_quant(ifm_tensor, ofm_tensor, ifm2_tensor=ifm2_tensor)

    kernel = Kernel(1, 1)
    is_elementwise = ps.npu_block_type == NpuBlockType.ElementWise
    uses_lut = False
    ifm_count = 1

    if ps.primary_op:
        kernel = ps.primary_op.kernel
        uses_lut = ps.primary_op.activation_lut is not None

    ifm_resampling_mode = resampling_mode.NONE
    ifm_bits = 0
    ifm_depth = 0
    if ifm_tensor:
        ifm_resampling_mode = ifm_tensor.resampling_mode
        ifm_bits = ifm_tensor.dtype.size_in_bits()

        if ifm_tensor.shape != []:
            ifm_depth = ifm_tensor.shape[-1]

        if is_elementwise:
            ifm_count = 2
            if ifm_tensor.shape == []:  # Scalar in ifm1
                assert ifm2_tensor
                ifm_depth = ifm2_tensor.shape[-1]
                ifm_count = 1
            elif not ifm2_tensor or ifm2_tensor.shape == []:  # Scalar in ifm2
                ifm_count = 1
    return SharedBufferAllocation(
        arch,
        kernel,
        uses_lut,
        npu_block_type=ps.npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=ifm_bits,
        ifm_depth=ifm_depth,
        ifm_count=ifm_count,
        ofm_shape=ofm_tensor.shape,
    )
| 220 | |
| 221 | |
def shared_buffer_allocation_for_pass_and_block_config(arch, ps, block_config) -> SharedBufferAllocation:
    """Returns the shared buffer allocation for the pass if the given block
    config (height, width, ifm depth, ofm depth) fits, otherwise None."""
    alloc = shared_buffer_allocation_for_pass(arch, ps)
    # Equal-depth ops derive their IFM block depth from the OFM block instead
    assert alloc.is_equal_depth_op or alloc.ifm_block_depth == block_config[2]
    ofm_block = Block(block_config[1], block_config[0], block_config[3])
    return alloc if alloc.try_block(ofm_block) else None
| 229 | |
| 230 | |
def shared_buffer_allocation_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, ifm_resampling_mode
) -> SharedBufferAllocation:
    """Creates a SharedBufferAllocation from a register-level NPU operation."""
    activation = npu_op.activation
    uses_lut = activation is not None and activation.op_type == NpuActivationOp.TABLE_LOOKUP
    fms = [fm for fm in (npu_op.ifm, npu_op.ofm, npu_op.ifm2) if fm is not None]
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms
    )
    # Two IFM areas only when a non-scalar second input is present
    ifm_count = 2 if npu_op.ifm2 is not None and npu_op.ifm2_scalar is None else 1
    ofm = npu_op.ofm.shape
    return SharedBufferAllocation(
        arch,
        to_kernel(npu_op.kernel),
        uses_lut,
        npu_block_type=npu_block_type,
        all_fms_have_quant=all_fms_have_quant,
        ifm_resampling_mode=ifm_resampling_mode,
        ifm_bits=npu_op.ifm.data_type.size_in_bits(),
        ifm_depth=npu_op.ifm.shape.depth,
        ifm_count=ifm_count,
        ofm_shape=[1, ofm.height, ofm.width, ofm.depth],
    )
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 255 | |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 256 | |
| 257 | def find_suitable_block_configs(arch, alloc: SharedBufferAllocation) -> List[Tuple]: |
| 258 | """Returns list of block configs that would fit with the given shared buffer allocation""" |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 259 | if arch.override_block_config: |
| 260 | config = alloc.try_block(arch.override_block_config) |
Tim Hall | 2a7ebe3 | 2020-06-18 11:42:21 +0100 | [diff] [blame] | 261 | if config is None: |
Fredrik Svedberg | a0c3624 | 2020-06-03 15:43:31 +0200 | [diff] [blame] | 262 | raise VelaError("Block config override '{0}' cannot be allocated".format(arch.override_block_config)) |
Tim Hall | 2a7ebe3 | 2020-06-18 11:42:21 +0100 | [diff] [blame] | 263 | return [config] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 264 | |
| 265 | # Constrain the search space if the OFM is smaller than the max block size |
| 266 | # - Add other block search constraints here if required |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 267 | if len(alloc.ofm_shape) <= 2: |
| 268 | max_block_height = max_block_width = alloc.ofm_shape[0] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 269 | else: |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 270 | max_block_width = alloc.ofm_shape[-2] |
| 271 | max_block_height = alloc.ofm_shape[-3] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 272 | |
| 273 | # Common block depth |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 274 | max_block_depth = alloc.ofm_shape[-1] |
Tim Hall | 79d07d2 | 2020-04-27 18:20:16 +0100 | [diff] [blame] | 275 | |
| 276 | # Constrain to valid ranges before search |
| 277 | max_block_width = min(arch.ofm_block_max.width, max_block_width) |
| 278 | max_block_height = min(arch.ofm_block_max.height, max_block_height) |
| 279 | max_block_depth = min(arch.ofm_block_max.depth, max_block_depth) |
| 280 | |
| 281 | valid_block_configs = [] |
| 282 | # Try a range of block shapes against this pass |
| 283 | for w in range(arch.ofm_ublock.width, max_block_width + arch.ofm_ublock.width, arch.ofm_ublock.width): |
| 284 | for h in range(arch.ofm_ublock.height, max_block_height + arch.ofm_ublock.height, arch.ofm_ublock.height): |
| 285 | # Try valid OFM block depths |
| 286 | for c in range(arch.ofm_ublock.depth, max_block_depth + arch.ofm_ublock.depth, arch.ofm_ublock.depth): |
| 287 | # OFM block depth has the constraint that if it causes the OFM to be |
| 288 | # split, it must be a multiple of the OFM split size |
| 289 | if (c >= max_block_depth) or (c < max_block_depth and (c % ArchitectureFeatures.OFMSplitDepth) == 0): |
| 290 | config = alloc.try_block(Block(w, h, c)) |
| 291 | if config: |
| 292 | valid_block_configs.append(config) |
| 293 | |
| 294 | assert len(valid_block_configs) > 0 |
| 295 | return valid_block_configs |
Louis Verhaard | e8a5a78 | 2020-11-02 18:04:27 +0100 | [diff] [blame^] | 296 | |
| 297 | |
def find_block_configs_suitable_for_pass_and_shared_buffer(arch, ps) -> List[Tuple]:
    """Returns the block configs that fit the shared buffer allocation of the given pass."""
    return find_suitable_block_configs(arch, shared_buffer_allocation_for_pass(arch, ps))