Blame - ethosu/vela/architecture_allocator.py - ml/ethos-u/ethos-u-vela

2021-05-27 18:49:40 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

16

#

17

# Description: Architecture SHRAM allocator

18

import enum

19

import math

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

20

from typing import Dict

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

21

from typing import Optional

22

from typing import Tuple

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

23

from typing import Union

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

24

25

from .architecture_features import ArchitectureFeatures

26

from .architecture_features import Block

27

from .architecture_features import SHRAMConfig

28

from .architecture_features import SHRAMElements

29

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

30

from .numeric_util import round_up

31

from .numeric_util import round_up_divide

32

from .operation import Kernel

33

from .operation import NpuBlockType

34

from .range_set import MemoryRangeSet

35

from .shape4d import Shape4D

36

from .tensor import MemArea

class SHRAMLayout:
    """Bank boundaries describing how SHRAM is split between IFM buffers, accumulators and LUT."""

    def __init__(self):
        # Every boundary starts at bank 0; the allocator fills in real values later
        self.ib_start = self.ib_end = self.ib_start2 = self.ab_start = self.lut_start = 0

class ArchitectureBlockConfig:
    """Block configuration: IFM/OFM block shapes together with the SHRAM layout they use."""

    def __init__(self):
        self.layout = SHRAMLayout()
        self.ifm_block = Shape4D()
        self.ofm_block = Shape4D()  # non-1D-optimised block
        self.acc_type = SHRAMElements.Acc32
        self.is_partkernel = False
        self.bank_size = 0
        self.ifm_depth_buf_scaling = 0

    def get_shram_memory_access_range(self):
        """Return the SHRAM memory access range used by this shared buffer, excluding the LUT."""
        # Everything below the first LUT bank belongs to the shared buffer
        range_end = self.layout.lut_start * self.bank_size
        return MemoryRangeSet(MemArea.Shram, 0, range_end)

    def old_style_representation(self):
        """Return the config as the list [OFM height, OFM width, IFM depth, OFM depth]."""
        ofm, ifm = self.ofm_block, self.ifm_block
        return [ofm.height, ofm.width, ifm.depth, ofm.depth]

    def __str__(self):
        return f"{self.old_style_representation()}"

68

69

70

# Width in bits of one accumulator element, keyed by accumulator SHRAM element type
_AccumulatorBits = {SHRAMElements.Acc16: 16, SHRAMElements.Acc32: 32, SHRAMElements.Acc40: 40}

71

72

73

class ElementwiseUsage(enum.IntEnum):
    """How an operation uses SHRAM for elementwise inputs (see _ew_usage / _try_block_config)."""

    # Not an elementwise operation: accumulator space is allocated
    No = 0
    # Elementwise operation: a second IFM buffer of the same size is allocated
    Full = 1
    # Elementwise operation with a scalar operand: no banks are allocated for the second IFM
    Scalar = 2

def _try_block_config(
    shram: SHRAMConfig,
    ew_usage: ElementwiseUsage,
    ofm_block: Union[Shape4D, Block],
    ifm_block: Union[Shape4D, Block],
    ifm_bits: int,
    ifm_granule: int,
    acc_bits: int,
    acc_granule: int,
    lut_banks: int,
    ifm_depth_buf_scaling: int,
    cores: int,
) -> Union[SHRAMLayout, None]:
    """Try to place the given IFM/OFM blocks in SHRAM.

    Returns the resulting SHRAMLayout (all values in banks), or None when the
    IFM buffer(s) and accumulators do not fit in front of the LUT banks.
    """
    assert (acc_bits > 0) and (acc_granule > 0)
    assert (ifm_bits >= 8) and ((ifm_bits % 8) == 0) and (ifm_granule > 0)

    bank_bytes = shram.bank_size_bytes

    # Scale depth with cores
    ifm_depth = round_up_divide(ifm_block.depth, ifm_depth_buf_scaling)
    ofm_depth = round_up_divide(ofm_block.depth, cores)

    # Always need IFM space; the bank count is doubled before rounding to the granule
    ifm_bytes = ifm_block.elements_wh() * round_up((ifm_depth * ifm_bits) / 8, 8)
    ifm_banks = round_up(round_up_divide(ifm_bytes, bank_bytes) * 2, ifm_granule)

    # SHRAM boundaries: IFM grows up from the reserved output banks, the LUT sits at the top
    ib_start = shram.reserved_output_banks
    lut_start = shram.total_banks - lut_banks
    ifm_end = ib_start + ifm_banks
    ifm2_start = ifm_end

    if ew_usage == ElementwiseUsage.No:
        # Not elementwise, so accumulator space is carved out just below the LUT
        acc_bytes = (ofm_block.elements_wh() * round_up(ofm_depth, 8) * acc_bits) // 8
        acc_banks = round_up(round_up_divide(acc_bytes, bank_bytes) * 2, acc_granule)
        acc_start = lut_start - acc_banks
        # IFM must still fit before accumulators
        if ifm_end > acc_start:
            return None
    else:
        # Elementwise: no accumulators, but a full (non-scalar) IFM2 needs its own banks
        acc_start = lut_start
        ifm2_banks = ifm_banks if ew_usage == ElementwiseUsage.Full else 0
        if ifm2_start + ifm2_banks > acc_start:
            return None
        ifm_end = acc_start

    # Everything fits, so return this layout
    layout = SHRAMLayout()
    layout.ib_start = ib_start
    layout.ib_start2 = ifm2_start
    layout.ib_end = ifm_end
    layout.ab_start = acc_start
    layout.lut_start = lut_start
    return layout

def _choose_kernel_method(ifm_shape: Shape4D, ifm_bits: int, kernel: Kernel) -> bool:
    """Return True when part-kernel should be used in preference to depth-kernel.

    Shallow IFMs (depth <= 8) always pick part-kernel; otherwise part-kernel is
    picked only when its utilisation is strictly better than depth-kernel's.
    """
    ifm_depth = ifm_shape.depth
    if ifm_depth <= 8:
        return True

    is_8bit = ifm_bits == 8

    # Compare part-kernel to depth-kernel and choose the one with best utilisation
    kernel_elements = kernel.elements_wh()
    depth_utilisation = ifm_depth / round_up(ifm_depth, 32 if is_8bit else 16)
    part_capacity = round_up(ifm_depth, 8) * round_up(kernel_elements, 4 if is_8bit else 2)
    part_utilisation = ifm_depth * kernel_elements / part_capacity

    return part_utilisation > depth_utilisation

150

151

152

def _ew_usage(npu_op_type: NpuBlockType, uses_scalar: bool) -> ElementwiseUsage:
    """Classify the elementwise SHRAM usage of an operation."""
    if npu_op_type == NpuBlockType.ElementWise:
        # Elementwise ops with a scalar operand are tracked separately from full ones
        return ElementwiseUsage.Scalar if uses_scalar else ElementwiseUsage.Full
    return ElementwiseUsage.No

def _acc_type(npu_op_type: NpuBlockType, ifm_bits: int, scaled: bool) -> int:
    """Returns accumulator type"""
    # Acc40 is only selected for scaled, non-pooling operations on 16-bit IFMs
    wants_acc40 = (ifm_bits == 16) and npu_op_type != NpuBlockType.Pooling and scaled
    return SHRAMElements.Acc40 if wants_acc40 else SHRAMElements.Acc32

Fredrik Svedberg

2021-09-29 10:08:04 +0200

[diff] [blame]

169

def is_nearest(ifm_resampling: resampling_mode) -> bool:
    """Return True when the IFM resampling mode is NEAREST."""
    uses_nearest = ifm_resampling == resampling_mode.NEAREST
    return uses_nearest

171

172

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

173

def to_upscale(ifm_resampling: resampling_mode) -> int:
    """Return the upscale factor for a resampling mode: 1 for NONE, 2 for any other mode."""
    if ifm_resampling == resampling_mode.NONE:
        return 1
    return 2

176

177

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

178

def _ifm_blockdepth(arch, ifm_shape: Union[Shape4D, Block], ifm_bits: int, is_partkernel: bool):
    """Return the IFM block depth: the IFM depth capped per bit-width/traversal, then rounded up."""
    if ifm_bits == 16:
        # 16-bit IFMs are capped at depth 16 and rounded to a multiple of 4
        return round_up(min(ifm_shape.depth, 16), 4)

    # Otherwise the cap depends on part-kernel use; round to the IFM ublock depth
    depth_cap = 16 if is_partkernel else 32
    return round_up(min(ifm_shape.depth, depth_cap), arch.ifm_ublock.depth)

184

185

Fredrik Svedberg

2021-09-29 10:08:04 +0200

[diff] [blame]

186

def _required_size(value: int, stride: int, border: int, upscale: int, nearest: bool) -> int:

187

return int(math.ceil(((value - 1) * stride + border + nearest) / upscale))

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

188

189

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

190

def get_ifm_area_required(
    ofm_shape: Union[Shape4D, Block], kernel: Kernel, resampling_mode: resampling_mode
) -> Tuple[int, int]:
    """Return the (width, height) of IFM needed to produce the given OFM shape with this kernel."""
    upscale = to_upscale(resampling_mode)
    nearest = is_nearest(resampling_mode)
    stride = kernel.stride
    required_height = _required_size(ofm_shape.height, stride.y, kernel.area_height(), upscale, nearest)
    required_width = _required_size(ofm_shape.width, stride.x, kernel.area_width(), upscale, nearest)
    # Note the order: width first, then height
    return (required_width, required_height)

def _get_ifm_blocksize(
    ofm_block: Union[Shape4D, Block], kernel: Kernel, ublock: Block, subkernel_limit: Block, upscale: int, nearest: bool
) -> Shape4D:
    """Return the IFM block shape needed to feed one OFM block.

    The kernel area is clamped to the subkernel limit per axis and the result is
    rounded up to the ublock size; the depth is taken from the OFM block as-is.
    """
    # IFM block height
    kernel_height = min(kernel.area_height(), subkernel_limit.height)
    required_height = _required_size(ofm_block.height, kernel.stride.y, kernel_height, upscale, nearest)
    height = round_up(required_height, ublock.height)

    # IFM block width
    kernel_width = min(kernel.area_width(), subkernel_limit.width)
    required_width = _required_size(ofm_block.width, kernel.stride.x, kernel_width, upscale, nearest)
    width = round_up(required_width, ublock.width)

    return Shape4D(1, height, width, ofm_block.depth)

218

219

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

220

def fit_block_for_ofm(
    arch: ArchitectureFeatures, ofm_shape: Union[Shape4D, Block], kernel: Kernel, block: Union[Shape4D, Block]
):
    """Return `block` adapted to the OFM shape; only the Conv1D case changes it.

    256/512 Conv1D optimisation (ratio of IFM:Accumulators changes). This is a specific
    interpretation of a more general constraint that can't be applied because the
    find_block_config function must return block configs that can be applied to any OFM shape.
    """
    is_conv1d_case = (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2)
    if not is_conv1d_case:
        return block

    # Clamp the block height to the OFM height
    clamped_height = min(block.height, ofm_shape.height)
    return Shape4D(1, clamped_height, block.width, block.depth)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

231

def find_block_config(
    arch: ArchitectureFeatures,
    npu_op_type: NpuBlockType,
    ofm_shape: Shape4D,
    ifm_shape: Shape4D,
    ifm2_shape: Optional[Shape4D],
    uses_scalar: bool,
    ifm_bits: int,
    kernel: Kernel,
    lut_banks: int,
    scaled: bool,
    ifm_resampling: resampling_mode,
) -> Optional[ArchitectureBlockConfig]:
    """Search for the OFM block shape with the lowest estimated fetch cost that fits in SHRAM.

    Every candidate OFM block (stepping in ublock units across height and width, and in
    depth) is tried with _try_block_config. For those that fit, a relative cost is computed
    from the estimated IFM and weight fetches per OFM element, and the cheapest is kept.

    Returns the chosen ArchitectureBlockConfig, or None if no candidate block fits.
    """
    SplitDepth = ArchitectureFeatures.OFMSplitDepth
    # Elementwise larger-volume correction: continue with whichever IFM has more elements
    if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():
        ifm_shape = ifm2_shape

    # Figure out if SHRAM should be portioned for elementwise
    ew_usage = _ew_usage(npu_op_type, uses_scalar)

    # Operator typing help
    is_pooling = npu_op_type == NpuBlockType.Pooling
    is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise
    is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise
    is_convolution = (npu_op_type == NpuBlockType.ConvolutionMxN) or is_depthwise

    # Block config to be returned
    config = ArchitectureBlockConfig()
    config.is_partkernel = is_convolution and _choose_kernel_method(ifm_shape, ifm_bits, kernel)

    # IFM is not broadcasted for pooling and depthwise ops and for elementwise
    # when there's no elementwise-broadcasting in depth
    elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (
        not ifm2_shape or ifm_shape.depth == ifm2_shape.depth
    )
    ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1
    config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

    # Accumulator & granule settings
    config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

    # Memory rounding granules
    acc_granule = arch.accumulator_granules[config.acc_type]
    acc_bits = _AccumulatorBits[config.acc_type]
    if ew_usage != ElementwiseUsage.No:
        ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]
    else:
        ifm_granule = arch.ifm_bank_granules[ifm_bits]
    # Never reserve fewer banks at the end of SHRAM than the architecture requires
    lut_banks = max(lut_banks, arch.shram.reserved_end_banks)
    upscale = to_upscale(ifm_resampling)
    nearest = is_nearest(ifm_resampling)

    # Subkernel repeats of the IFM
    ifm_repeats = round_up_divide(kernel.area_width(), arch.SubKernelMax.width) * round_up_divide(
        kernel.area_height(), arch.SubKernelMax.height
    )
    ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, config.is_partkernel)

    # Weights fetch (for operators that have them)
    weight_fetch_wh = (kernel.area_width() * kernel.area_height()) if is_convolution else 0

    # Search space: the OFM shape clamped to the maximum block size, rounded up to whole
    # ublocks (with the ublock depth multiplied by the number of cores)
    ofm_ublock_depth = arch.ofm_ublock.depth * arch.ncores
    search_space = Shape4D.min(ofm_shape, Shape4D(arch.ofm_block_max.to_hwc()))
    search_space = Shape4D.round_up(search_space, Shape4D(arch.ofm_ublock.to_hwc()).with_depth(ofm_ublock_depth))

    # Block WHC search, loops across the search space looking for best efficiency
    best_cost = math.inf
    best_coverage = math.inf
    depth = max(arch.ofm_ublock.depth, min(search_space.depth, SplitDepth))
    # A block depth smaller than the OFM depth is rounded up to a multiple of SplitDepth
    if depth < ofm_shape.depth:
        depth = round_up(depth, SplitDepth)

    while depth <= search_space.depth:
        # Reset for every depth; maps a block size to True once its transpose failed to fit
        wont_fit: Dict[Tuple[int, int], bool] = {}
        for height in range(arch.ofm_ublock.height, search_space.height + 1, arch.ofm_ublock.height):
            for width in range(arch.ofm_ublock.width, search_space.width + 1, arch.ofm_ublock.width):
                # Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't
                # fit, then 4x8x16 won't either.
                if wont_fit.get((height, width), False):
                    continue

                # Calculate the IFM block dimensions required to feed this OFM block
                ofm_block = Shape4D(1, height, width, depth)
                ifm_block = _get_ifm_blocksize(ofm_block, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)
                if not is_equal_depth_op:
                    ifm_block = ifm_block.with_depth(ifm_blockdepth)

                # Test if the IFM/OFM blocks fit into SHRAM
                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                layout = _try_block_config(
                    arch.shram,
                    ew_usage,
                    Block(ofm_block.width, ofm_block.height, ofm_block.depth),
                    Block(ifm_block.width, ifm_block.height, ifm_block.depth),
                    ifm_bits,
                    ifm_granule,
                    acc_bits,
                    acc_granule,
                    lut_banks,
                    ifm_depth_buf_scaling,
                    arch.ncores,
                )

                if layout:
                    # full_blocks is rounded up per axis; blocks is the plain quotient
                    full_blocks = Shape4D.div_round_up(ofm_shape, ofm_block)
                    blocks = ofm_shape / ofm_block

                    # Weights fetching
                    weight_fetch = weight_fetch_wh * ifm_shape.depth * full_blocks.elements_wh()
                    if not is_depthwise:
                        weight_fetch *= ofm_block.depth * blocks.depth

                    # IFM fetching
                    ifm_fetch = ifm_block.elements_wh() * ifm_shape.depth * ifm_repeats * blocks.elements_wh()
                    if not is_equal_depth_op:
                        ifm_fetch *= full_blocks.depth

                    # Scale relative to every output OFM element
                    relative_cost = (ifm_fetch + weight_fetch) / ofm_shape.elements()

                    # If the entire IFM can be encompassed by both buffers, bias to prefer this configuration
                    if ifm_shape.elements() < ifm_block.elements() * 2:
                        relative_cost = relative_cost / 2

                    # Choose based on relative minimum cost or larger IFM area (if equal cost)
                    if relative_cost <= best_cost:
                        choose_this = False
                        # Check IFM coverage only when it's equal best_cost and small OFM
                        if relative_cost == best_cost:
                            coverage_shape = Shape4D.min(ifm_shape, ifm_block)
                            coverage = ifm_shape.elements_wh() / coverage_shape.elements_wh()
                            # Small 4x4 IFM constraint found through analysis of networks
                            if coverage <= best_coverage and (height <= 4 and width <= 4):
                                best_coverage = coverage
                                choose_this = True
                        else:
                            # Strictly cheaper: always take it and restart the coverage tie-break
                            best_coverage = math.inf
                            choose_this = True

                        if choose_this:
                            best_cost = relative_cost
                            config.layout = layout
                            config.bank_size = arch.shram_bank_size
                            config.ifm_block = ifm_block
                            # Store the block as searched, not the one returned by fit_block_for_ofm
                            config.ofm_block = Shape4D(1, height, width, depth)
                else:
                    # Stored transposed so the (height, width) lookup above skips the transposed block
                    wont_fit[(width, height)] = True

        # Step to the next candidate depth, keeping partial depths on SplitDepth multiples
        depth = depth + arch.ofm_ublock.depth
        if depth < ofm_shape.depth:
            depth = round_up(depth, SplitDepth)

    # best_cost is only updated when a candidate was chosen
    if best_cost != math.inf:
        return config

    return None

def try_block_config(

391

block_config: Block,

392

arch: ArchitectureFeatures,

393

npu_op_type: NpuBlockType,

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame^]

394

ofm_shape: Union[Shape4D, Block],

395

ifm_shape: Union[Shape4D, Block],

396

ifm2_shape: Optional[Union[Shape4D, Block]],

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

uses_scalar: bool,

ifm_bits: int,

is_partkernel: bool,

kernel: Kernel,

lut_banks: int,

scaled: bool,

ifm_resampling: resampling_mode,

404

) -> Optional[ArchitectureBlockConfig]:

405

"""

406

Given a block_config, returns a corresponding ArchitectureBlockConfig.

407

Returns None if the block_config does not fit or is invalid.

408

"""

409

# Check block config validity

410

if not all(

411

blk > 0 and blk <= blk_max and blk % ublk == 0

412

for blk, blk_max, ublk in zip(block_config.as_list(), arch.ofm_block_max.as_list(), arch.ofm_ublock.as_list())

413

):

414

return None

415

# Elementwise larger-volume correction

416

if ifm2_shape is not None and ifm2_shape.elements() > ifm_shape.elements():

417

ifm_shape = ifm2_shape

418

419

ew_usage = _ew_usage(npu_op_type, uses_scalar)

420

421

# Operator typing help

422

is_pooling = npu_op_type == NpuBlockType.Pooling

423

is_depthwise = npu_op_type == NpuBlockType.ConvolutionDepthWise

424

is_equal_depth_op = (ew_usage != ElementwiseUsage.No) or is_pooling or is_depthwise

425

426

# Block config to be returned

427

config = ArchitectureBlockConfig()

428

config.is_partkernel = is_partkernel

429

Louis Verhaard

2022-03-17 15:59:04 +0100

[diff] [blame]

430

# IFM is not broadcasted for pooling and depthwise ops and for elementwise

431

# when there's no elementwise-broadcasting in depth

432

elemwise_buf_scalable = npu_op_type == NpuBlockType.ElementWise and (

433

not ifm2_shape or ifm_shape.depth == ifm2_shape.depth

434

)

435

ifm_depth_buf_scaling = arch.ncores if is_pooling or is_depthwise or elemwise_buf_scalable else 1

436

config.ifm_depth_buf_scaling = ifm_depth_buf_scaling

437

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

438

# Accumulator & granule settings

439

config.acc_type = _acc_type(npu_op_type, ifm_bits, scaled)

440

441

# Memory rounding granules

442

acc_granule = arch.accumulator_granules[config.acc_type]

443

acc_bits = _AccumulatorBits[config.acc_type]

444

if ew_usage != ElementwiseUsage.No:

445

ifm_granule = arch.ifm_ew_bank_granules[ifm_bits]

446

else:

447

ifm_granule = arch.ifm_bank_granules[ifm_bits]

448

lut_banks = max(lut_banks, arch.shram.reserved_end_banks)

449

upscale = to_upscale(ifm_resampling)

Fredrik Svedberg

2021-09-29 10:08:04 +0200

[diff] [blame]

450

nearest = is_nearest(ifm_resampling)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

451

ifm_blockdepth = _ifm_blockdepth(arch, ifm_shape, ifm_bits, is_partkernel)

Fredrik Svedberg

2021-09-29 10:08:04 +0200

[diff] [blame]

452

ifm_block = _get_ifm_blocksize(block_config, kernel, arch.ofm_ublock, arch.SubKernelMax, upscale, nearest)

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

453

if not is_equal_depth_op:

454

ifm_block = ifm_block.with_depth(ifm_blockdepth)

455

Tim Hall

2021-06-17 17:03:49 +0100

[diff] [blame]

456

# 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)

James Ward

399c4a2

2021-10-20 11:04:46 +0100

[diff] [blame]

457

block_config_opt = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)

Tim Hall

2021-06-17 17:03:49 +0100

[diff] [blame]

458

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

459

layout = _try_block_config(

Louis Verhaard

2022-03-17 15:59:04 +0100

[diff] [blame]

arch.shram,

ew_usage,

block_config_opt,

ifm_block,

ifm_bits,

ifm_granule,

acc_bits,

acc_granule,

lut_banks,

ifm_depth_buf_scaling,

470

arch.ncores,

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

)

if layout is None:

return None

config.layout = layout

475

config.bank_size = arch.shram_bank_size

476

config.ifm_block = ifm_block

Jacob Bohlin

b8060f5

2021-08-09 12:22:51 +0100

[diff] [blame]

477

config.ofm_block = block_config

Tim Hall