# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and produces a
# bit stream suitable for interpretation by the Ethos-U processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import create_default_arch
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


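# RegisterMachine caches the last value written to each register so that
# redundant register writes can be dropped from the command stream. With
# n_banks == 1, switch_bank() is effectively a no-op; it exists as a hook for
# double-banked register tracking around NPU_OP_* commands (see
# CommandStreamEmitter.cmd_do_operation, which switches bank after every
# operation).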
class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True  # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


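# A command stream is a flat sequence of 32-bit words. The first word of every
# command packs the payload mode (CmdMode) and opcode into its low 16 bits and
# an immediate parameter into its high 16 bits; Payload32 commands are followed
# by one extra word carrying a 32-bit payload. For illustration, the one-word
# form assembled by cmd0_with_param() below is:
#
#     command = cmd.value | (param << 16)
#
# while cmd1_with_offset() additionally ORs in CmdMode.Payload32 and appends
# the payload word.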
class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = "  0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += "   0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += "   -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory 2 memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)
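
# Illustrative example for quantise() above: with NpuQuantization(scale_f32=0.5,
# zero_point=10), a real value of 1.0 maps to 10 + round(1.0 / 0.5) = 12
# (assuming quantise_float32() computes zero_point + round(value / scale)).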


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant; set the UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)
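
# KERNEL_STRIDE bit layout as assembled above: bit 0 holds the low bit of
# (stride_x - 1), bit 1 the low bit of (stride_y - 1), bit 2 the
# part-kernel-first flag, bits 3/4 hold (dilation_x/y - 1), and bits 6+ and 9+
# hold the stride extension bits. For example, 2x2 striding with no dilation
# and depth-first traversal encodes as 0b11 = 3.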


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias/scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding, Vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # For ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # In case of avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
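
# Note: cmd1_with_offset() is reused here so that NPU_SET_OFM_SCALE carries the
# scale as its 32-bit payload and the shift in its 16-bit parameter field.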


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)
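
# Worked example: an int8 NHWC feature map with h=4, w=8, c=16 yields
# stride_c = 1, stride_x = 16 and stride_y = 128 bytes; the same shape in
# NHCWB16 yields stride_x = 16, stride_c = 128 and stride_y = 128.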


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]
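
# Tile numbering used above: tile 0 is top-left, tile 1 top-right
# (x >= width_0), tile 2 bottom-left (y >= height_0) and tile 3 bottom-right,
# matching the tile index selection in get_address().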


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])
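
# Watermark serves two related purposes: get_wait_dependency() uses it as a
# search watermark holding indices into the operation list, and also returns a
# second Watermark of outstanding-operation counts for generate_cmd_waits(),
# where -1 means no wait command is needed.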


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete for those dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding
Tim Hall79d07d22020-04-27 18:20:16 +0100918
919
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100920def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
921 if cmd_waits.npu >= 0:
922 emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)
923
924 if cmd_waits.dma >= 0:
925 emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)
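
# The wait commands take an "outstanding count" rather than an absolute stream
# position: NPU_OP_KERNEL_WAIT n / NPU_OP_DMA_WAIT n stall the command stream
# until at most n kernel/DMA operations remain in flight, so a count of 0
# waits for everything issued so far on that queue.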


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth
Tim Hall79d07d22020-04-27 18:20:16 +0100963
964
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100965def calc_blockdep(
966 arch: ArchitectureFeatures,
967 prev_op: Optional[NpuBlockOperation],
968 prev_block_config: Optional[NpuShape3D],
969 npu_op: NpuBlockOperation,
970 block_config: NpuShape3D,
971) -> int:
972 """Calculates the value of the BLOCKDEP register"""
973 if prev_op is None:
974 return 0
975 if not is_dependent_on_prev_op(prev_op, npu_op):
976 return ArchitectureFeatures.MAX_BLOCKDEP
977 if prev_op.ofm.shape != npu_op.ifm.shape:
978 return 0
979 prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
980 prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
981 prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
982 prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
983 cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
984 cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
985 cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
986 cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
987 cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
988 blockdep = arch.calc_block_dep(
989 prev_ifm_rect,
990 prev_ofm_rect,
991 prev_ifm_block_depth,
992 prev_ofm_block,
993 to_kernel(prev_op.kernel),
994 cur_ifm_rect,
995 cur_ofm_rect,
996 cur_ifm_block_depth,
997 cur_ofm_block,
998 to_kernel(npu_op.kernel),
999 cur_padLT,
1000 )
1001 return blockdep
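
# BLOCKDEP expresses, in blocks, how far the current operation may run ahead of
# the previous one: MAX_BLOCKDEP means the operations are independent, while 0
# forces full serialisation, e.g. when the previous OFM shape differs from the
# current IFM shape and the overlap cannot be analysed.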


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"      {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"         {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"      IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"      Kernel: {k}")
    if npu_op.padding is not None:
        print(f"      {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"      Weights: {weights}")
    for bias in npu_op.biases:
        print(f"      Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"      Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"      {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"      Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
1110 """Generates register commands for depthwise convolution operations"""
1111 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
1112 ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
1113 shared_buffer = shared_buffer_allocation_for_npu_op(
1114 arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
1115 )
1116 block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
1117 generate_shram_registers_non_elementwise(emit, shared_buffer)
1118 return block_config
1119
1120
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero; can only be set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return: Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    emit = CommandStreamEmitter()
    arch = create_default_arch(accelerator)
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()
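

# Minimal usage sketch (illustrative only; "npu_ops" stands for an
# already-built list of NpuOperation objects, normally constructed via the
# public API in api.py):
#
#     from ethosu.vela.api import NpuAccelerator
#     cmds = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)
#     # "cmds" is a list of 32-bit words, ready to be placed in the command stream buffer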