# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


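# The RegisterMachine remembers the last value written to each register so
# that redundant SET commands can be dropped from the stream. The
# CommandStreamEmitter keeps two instances (one for DMA registers, one for
# everything else), and cmd_do_operation() switches banks after every NPU_OP_*
# command; with n_banks == 1 the bank switch is currently a no-op.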
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


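# Command word encoding: the low 16 bits of every command word hold the 10-bit
# opcode (CmdOpMask) plus the payload-mode bits (Mask); the upper 16 bits hold
# an immediate parameter. A NoPayload (cmd0) command is a single 32-bit word,
# while a Payload32 (cmd1) command is followed by one extra 32-bit payload
# word. Illustrative example: cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, 3) emits
# the single word (3 << 16) | cmd0.NPU_SET_BLOCKDEP.value.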
class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


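# A minimal worked example for quantise() below (values assumed): with
# quant.scale_f32 == 0.5 and quant.zero_point == 10, quantise(1.0, quant)
# returns 10 + round(1.0 / 0.5) == 12, assuming numeric_util.quantise_float32
# computes zero_point + round_away_zero(value / scale).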
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


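# Note on generate_activation() below: the clamp values default to the full
# OFM data type range and are additionally clamped to the int16 range, which
# is assumed here to be the width of the ACTIVATION_MIN/MAX register fields.
# For table lookup, activation values 16..23 select LUT slots 0..7, and for an
# INT32 OFM bits 12..13 of the ACTIVATION register force the I8 output range.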
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


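# Bit layout of the KERNEL_STRIDE register as packed by generate_kernel()
# below (field positions read off the shifts in the code):
#
#   bit 0     (stride_x - 1) & 1    x stride, low bit
#   bit 1     (stride_y - 1) & 1    y stride, low bit
#   bit 2     part-kernel-first weight traversal
#   bit 3     dilation_x - 1
#   bit 4     dilation_y - 1
#   bit 6..   (stride_x - 1) >> 1   x stride, extension bits
#   bit 9..   (stride_y - 1) >> 1   y stride, extension bits
#
# Illustrative example: a 3x3 kernel with stride 2x2, no dilation and
# depth-first traversal packs to 0b11 = 3, while KERNEL_HEIGHT_M1/WIDTH_M1
# hold the dilated extent, dilation * (kernel_size - 1) = 2.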
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= (kernel.stride_y - 1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (kernel.stride_x - 1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (kernel.stride_y - 1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise, set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers are not needed, so set AB_START to the size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


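# Note on generate_ofm_scaling_for_pooling() below: for fused SIGMOID/TANH the
# output scale is fixed at 1 / 0x3000, i.e. 1 / (3 * 4096), which is assumed
# to match the fixed-point range of the hardware activation tables; this is
# why the rescale computation starts from 0x3000 * ifm_scale. The same
# constant appears in generate_scaling_for_elementwise().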
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # The Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


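# Note on generate_scaling_for_elementwise() below: for ADD/SUB with differing
# input scales, the advanced scaling path rescales one operand via
# OPA_SCALE/OPB_SCALE, and the returned op_to_scale value is fed into bits 8..
# of the IFM_PRECISION register (see generate_common/generate_ifm_precision)
# to select which operand the scale applies to.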
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


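# Layout maths for get_strides() below: for NHWC the strides follow directly
# from the element size, while NHCWB16 groups the depth into bricks of 16
# channels, so the "C" stride steps between 16-deep bricks. Worked example
# (values assumed): an int8 NHWC feature map with h=2, w=3, c=8 gives strides
# C/X/Y = 1/8/24 bytes; the same map in NHCWB16 gives X = 16 * 1 = 16,
# C = 16 * 3 = 48 and Y = 3 * round_up(8, 16) = 48 bytes.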
def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)


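# Tile addressing for get_address() below: a feature map can be split into up
# to four tiles, with base addresses[0..3] = top-left, top-right, bottom-left,
# bottom-right. The tile index t is derived from the coordinate: x >= width_0
# selects the right-hand tiles (t = 1), and y >= height_0 (height_1 on the
# right-hand side) selects the bottom tiles (t += 2).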
def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns the address of the given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets the address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


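# Note on get_wait_dependency() below: the emitted wait commands carry an
# "allowed outstanding" count rather than an absolute stream position. The
# code counts how many DMA (or NPU) operations sit between the dependency and
# the current op; an emitted DMA wait of 0 is thus assumed to stall until
# every in-flight DMA transfer has completed.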
def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


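# Note on calc_blockdep() below: BLOCKDEP limits how far the current operation
# may run ahead of the previous one, in units of output blocks. 0 fully
# serialises the two operations (used when there is no previous op or the
# shapes differ), MAX_BLOCKDEP allows free overlap when the operations touch
# disjoint addresses, and intermediate values are computed by
# arch.calc_block_dep() from the block shapes and kernels of both operations.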
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    prev_block_config: Optional[NpuShape3D],
    npu_op: NpuBlockOperation,
    block_config: NpuShape3D,
) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"    {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(
        arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
    )
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config.
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


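# Note on generate_command_stream() below: for every operation the emission
# order is fixed: (1) all SET_* register commands, (2) NPU_SET_BLOCKDEP for
# non-DMA operations, (3) any KERNEL_WAIT/DMA_WAIT commands, and (4) the
# NPU_OP_* command itself. The stream is terminated with NPU_OP_STOP
# (param 0xFFFF).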
def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in the final part of the command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


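# A minimal usage sketch for the entry point below (the operation list and the
# accelerator variant are assumptions, not part of this module):
#
#     ops = build_npu_ops()  # hypothetical helper producing List[NpuOperation]
#     cmds = generate_register_command_stream(ops, NpuAccelerator.Ethos_U55_128)
#
# As the docstring notes, external users are expected to reach this through
# the public facing API rather than importing this module directly.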
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = ArchitectureFeatures(
        vela_config_files=None,
        accelerator_config=accelerator.value,
        system_config=ArchitectureFeatures.DEFAULT_CONFIG,
        memory_mode=ArchitectureFeatures.DEFAULT_CONFIG,
        override_block_config=None,
        block_config_limit=None,
        max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
        weight_estimation_scaling=1.0,
        verbose_config=False,
    )
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
1297 return emit.to_list()