# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U55. Takes a list of NPU operations and generates
# all the register settings. Calculates dependencies between commands, inserts wait operations, and generates a bit
# stream suitable for interpretation by the Ethos-U55 processor.
from collections import defaultdict
from collections import namedtuple
from enum import Enum
from enum import IntEnum
from typing import List
from typing import Optional

import numpy as np

from . import numeric_util
from . import scaling
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import Block
from .architecture_features import Rect
from .architecture_features import SharedBufferArea
from .architecture_features import SHRAMElements
from .debug_database import DebugDatabase
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .high_level_command_stream import CommandType
from .high_level_command_to_npu_op import convert_command_to_npu_op
from .high_level_command_to_npu_op import to_kernel
from .high_level_command_to_npu_op import unary_elementwise_ops
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import AccessDirection
from .range_set import MemoryAccessSet
from .range_set import MemoryRangeSet
from .shared_buffer_allocation import find_suitable_block_configs
from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op
from .shared_buffer_allocation import SharedBufferAllocation


class RegisterMachine:
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF

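# A cmd0 word packs a 10-bit opcode in bits [9:0], the payload mode in bits
# [15:14] and a 16-bit parameter in bits [31:16]; cmd1 commands are followed
# by a separate 32-bit payload word. Illustrative decode of a command word w:
#   op = cmd0(w & CmdMode.CmdOpMask)  # when (w & CmdMode.Mask) == CmdMode.NoPayload
#   param = (w >> 16) & 0xFFFF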

class CommandStreamEmitter:
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code:    Command:                       Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
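        # The wait parameter packs the event channel into bits [15:4] and the
        # allowed number of still-outstanding jobs into bits [3:0], e.g.
        # channel 0 with 2 outstanding jobs gives param 2 (illustrative)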
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


class BasePointerIndex(IntEnum):
    WeightTensor = 0  # base address index for the Weight tensor
    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
    ScratchFastTensor = 2  # base address index for the Scratch_fast_tensor
    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantises the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
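    # e.g. value 1.0 with scale_f32 0.5 and zero_point 128 quantises to
    # round(1.0 / 0.5) + 128 = 130 (illustrative; assumes quantise_float32
    # rounds value / scale away from zero and adds the zero point)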
    return quantise_float32(value, scale, zp)


def has_ifm2(npu_op: NpuBlockOperation) -> bool:
    """Checks if op has non-scalar IFM2"""
    return npu_op.ifm2 is not None and npu_op.ifm2_scalar is None


def is_dma_op(npu_op: NpuOperation) -> bool:
    """Checks if op is a DMA operation"""
    return npu_op.op_type == NpuOperationType.Dma


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
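    # e.g. a fused ReLU with max 6.0 on an int8 OFM with scale 6/255 and zero
    # point -128 clamps to [-128, round(6.0 * 255 / 6) - 128] = [-128, 127]
    # (illustrative)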
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    emit.cmd1_with_offset(ptr_cmds[0], addresses[0])
    emit.cmd1_with_offset(ptr_cmds[1], addresses[1])
    emit.cmd1_with_offset(ptr_cmds[2], addresses[2])
    emit.cmd1_with_offset(ptr_cmds[3], addresses[3])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_offset(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_offset(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_offset(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
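    # e.g. signed int16 in NHCWB16 layout with op_to_scale 0 packs to
    # 1 | (1 << 2) | (1 << 6) = 0x45 (illustrative)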
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit in IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weight sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_offset(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_offset(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_offset(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
) -> NpuShape3D:
    """Selects a suitable block config if none has been set, and generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_config = npu_op.block_config
    if block_config is None or block_config.height < 0:
        # Note: this code is only used if the public API to generate command streams is used;
        # in the "normal" flow, the block config selected by the scheduler is used
        if npu_op.weights:
            assert block_config is not None, "block_config.depth must be provided for ops with weights"
        # Block config has not been provided: find one
        blocks = find_suitable_block_configs(arch, shared_buffer)
        # Return the block with the biggest volume
        # TODO: use a better algorithm to find the best block
        best_block = None
        best_value = 0
        for block in blocks:
            if block_config is not None and block[3] != block_config.depth:
                continue
            value = block[0] * block[1] * block[3]
            if value > best_value:
                best_value = value
                best_block = block
        assert best_block is not None, f"No suitable block config was found, {npu_op.op_type}"
        block_config = NpuShape3D(height=best_block[0], width=best_block[1], depth=best_block[3])
    alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))
    assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)
    return block_config


def generate_shram_registers_elementwise(
    emit: CommandStreamEmitter,
    npu_op: NpuElementWiseOperation,
    arch: ArchitectureFeatures,
    shared_buffer: SharedBufferAllocation,
):
    """Generates IB_END/IB_START/AB_START registers for elementwise operations"""
    # For elementwise set the required SHRAM to be equal to the total size of available SHRAM
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    shram_required = arch.available_shram_banks(uses_lut)

    # Acc buffers not needed so set AB_START to size of SHRAM
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)
    if has_ifm2(npu_op):
        # Set IFM2_IB_START to the latter half of the IB space
        ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]
        emit.cmd0_with_param(
            cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,
        )
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):
    """Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""
    emit.cmd0_with_param(
        cmd0.NPU_SET_IFM_IB_END,
        shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],
    )
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generates registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            shift = 0
            max_rescale = np.iinfo(np.int16).max / 2
            while rescale <= max_rescale and shift <= 30:
                shift += 1
                rescale *= 2
            scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
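            # bin() prefixes "0b", hence the -2; e.g. rescale = 4.0 gives
            # len("0b100") - 2 + 1 = 4 bits for the rescale (illustrative)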
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with "rescale" in primary_op.attrs
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # If an average pool is fused with a concat or other memory operation, rescaling may be needed.
        # In that case kernel height == kernel width == 1 always holds.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale must account for the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns which operand to scale.
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            if None in (input_scale, input2_scale, output_scale):
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
            else:
                # Use advanced implementation only when input scales differ
                bitdepth = npu_op.ifm.data_type.size_in_bits()
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale


# -------------------------------------------------------------------
# ADDRESSING/STRIDES (helper functions)
# -------------------------------------------------------------------


def ranges_overlap(range1: NpuAddressRange, range2: NpuAddressRange) -> bool:
    """Checks if the ranges overlap"""
    return range1.region == range2.region and numeric_util.overlaps(
        range1.address, range1.address + range1.length, range2.address, range2.address + range2.length
    )


def get_strides(fm: NpuFeatureMap) -> NpuShape3D:
    """Calculates STRIDE_C/Y/X"""
    if fm.strides is not None:
        return fm.strides
    elem_size = fm.data_type.size_in_bytes()
    if fm.layout == NpuLayout.NHWC:
        stride_c = elem_size
        stride_x = fm.shape.depth * stride_c
        stride_y = fm.shape.width * stride_x
    else:
        stride_x = 16 * elem_size
        stride_c = stride_x * fm.shape.width
        stride_y = elem_size * fm.shape.width * numeric_util.round_up(fm.shape.depth, 16)
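    # e.g. int8 NHWC with shape h=4, w=8, c=16 gives strides c=1, x=16, y=128;
    # the same shape in NHCWB16 gives x=16, c=128, y=128 (illustrative)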
    return NpuShape3D(depth=stride_c, height=stride_y, width=stride_x)


def get_address(fm: NpuFeatureMap, strides: NpuShape3D, y: int, x: int, c: int) -> int:
    """Returns address of given coordinate"""
    t = 0
    BRICK = 16
    stride_c = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHWC else strides.depth
    stride_x = BRICK * fm.data_type.size_in_bytes() if fm.layout == NpuLayout.NHCWB16 else strides.width
    if x >= fm.tiles.width_0:
        x -= fm.tiles.width_0
        t = 1
        if y >= fm.tiles.height_1:
            y -= fm.tiles.height_1
            t += 2
    elif y >= fm.tiles.height_0:
        y -= fm.tiles.height_0
        t += 2
    elem_size = fm.data_type.size_in_bytes()
    return (
        fm.tiles.addresses[t] + y * strides.height + x * stride_x + (c // BRICK) * stride_c + int(c % BRICK) * elem_size
    )


def get_address_range(
    fm: NpuFeatureMap, strides: NpuShape3D, y0: int, x0: int, c0: int, y1: int, x1: int, c1: int
) -> NpuAddressRange:
    """Gets address range for (y0, x0, c0) - (y1, x1, c1)"""
    addr0 = get_address(fm, strides, y0, x0, c0)
    addr1 = get_address(fm, strides, y1, x1, c1)
    return NpuAddressRange(region=fm.region, address=addr0, length=addr1 - addr0 + fm.data_type.size_in_bytes())


def get_address_ranges(fm: NpuFeatureMap) -> List[Optional[NpuAddressRange]]:
    """Returns 4 address ranges, one for every tile, None if the tile is not in use"""
    strides = get_strides(fm)
    height, width, depth = fm.shape.height, fm.shape.width, fm.shape.depth
    height_0, height_1, width_0 = fm.tiles.height_0, fm.tiles.height_1, fm.tiles.width_0
    t0 = get_address_range(fm, strides, 0, 0, 0, min(height, height_0) - 1, min(width, width_0) - 1, depth - 1,)
    if width > width_0:
        t1 = get_address_range(fm, strides, 0, width_0, 0, min(height, height_1) - 1, width - 1, depth - 1)
    else:
        t1 = None
    if height > height_0:
        t2 = get_address_range(fm, strides, height_0, 0, 0, height - 1, min(width, width_0) - 1, depth - 1)
    else:
        t2 = None
    if t1 is not None and t2 is not None:
        t3 = get_address_range(fm, strides, height_0, width_0, 0, height - 1, width - 1, depth - 1)
    else:
        t3 = None
    return [t0, t1, t2, t3]


# -------------------------------------------------------------------
# DMA_WAIT/KERNEL_WAIT
# -------------------------------------------------------------------


Watermark = namedtuple("Watermark", ["npu", "dma"])


def memory_range_set(range: NpuAddressRange) -> MemoryRangeSet:
    return MemoryRangeSet(range.region, range.address, range.address + range.length)


def get_dma_memory_accesses(dma_op: NpuDmaOperation) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given DMA operation"""
    res = MemoryAccessSet()
    res.add(memory_range_set(dma_op.src), AccessDirection.Read)
    res.add(memory_range_set(dma_op.dest), AccessDirection.Write)
    return res


def get_op_memory_accesses(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> MemoryAccessSet:
    """Returns the addresses that are read and written by the given operation"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Read addresses
    read_ranges = get_address_ranges(npu_op.ifm)
    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        read_ranges.extend(get_address_ranges(npu_op.ifm2))
    read_ranges.extend(npu_op.weights)
    read_ranges.extend(npu_op.biases)
    if npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP:
        address = arch.available_shram_banks(True) * arch.shram_bank_size
        read_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=address, length=2048))
    # Written addresses
    write_ranges = get_address_ranges(npu_op.ofm)
    # Add write access to SHRAM, needed when LUTs can overwrite accumulator banks
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    written_shram_size = arch.available_shram_banks(uses_lut) * arch.shram_bank_size
    write_ranges.append(NpuAddressRange(region=BasePointerIndex.Mem2Mem, address=0, length=written_shram_size))

    res = MemoryAccessSet()
    for read_range in read_ranges:
        if read_range is not None:
            res.add(memory_range_set(read_range), AccessDirection.Read)
    for write_range in write_ranges:
        if write_range is not None:
            res.add(memory_range_set(write_range), AccessDirection.Write)
    return res


def get_wait_dependency(
    arch: ArchitectureFeatures, npu_op_list: List[NpuOperation], memory_accesses, op_index: int, watermark: Watermark
):
    """Used to calculate whether DMA wait or kernel wait operations are needed"""
    npu_op = npu_op_list[op_index]
    op_access = memory_accesses[npu_op]
    index = op_index - 1

    # NPU dependency tracking
    npu_outstanding = -1
    npu_ops = 0
    npu_index = watermark.npu

    # DMA dependency tracking
    dma_outstanding = -1
    dma_ops = 0
    dma_index = watermark.dma

    # Seek back in the command stream looking for NPU or DMA dependencies
    # but only as far as the first dependency or the watermarks (dependencies
    # before this point have been satisfied already).
    # The watermark moves to after the latest element we must wait for, not
    # the command that issues the wait.
    # NPU->NPU dependency is handled via blockdep.
    while (index >= npu_index) or (index >= dma_index):
        prev_op = npu_op_list[index]
        prev_access = memory_accesses[prev_op]

        # Check NPU consuming DMA output
        if is_dma_op(prev_op):
            if index >= dma_index:
                if not is_dma_op(npu_op):
                    if (dma_outstanding == -1) and prev_access.conflicts(op_access):
                        dma_outstanding = dma_ops
                dma_ops += 1  # Count DMA ops in the pipeline
                if dma_ops >= arch.max_outstanding_dma:
                    dma_index = max(index + 1, dma_index)
        # Check DMA consuming NPU output
        else:
            if index >= npu_index:
                if is_dma_op(npu_op) and npu_outstanding == -1 and prev_access.conflicts(op_access):
                    npu_outstanding = npu_ops
                npu_ops += 1  # Count NPU ops in the pipeline
                if npu_ops >= arch.max_outstanding_kernels:
                    npu_index = max(index + 1, npu_index)

        index -= 1

    # Update DMA watermark if we didn't see any and the NPU pipeline is full
    if (dma_ops == 0) and (npu_ops >= arch.max_outstanding_kernels):
        dma_index = op_index

    # Bring the search watermark forwards as we complete these dependencies
    watermark = Watermark(npu_index, dma_index)
    outstanding = Watermark(npu_outstanding, dma_outstanding)

    return watermark, outstanding


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


# -------------------------------------------------------------------
# BLOCKDEP
# -------------------------------------------------------------------


def is_dependent_on_prev_op(prev_op: NpuBlockOperation, npu_op: NpuBlockOperation) -> bool:
    """Checks if npu_op's input is dependent on prev_op's output"""
    assert npu_op.ifm is not None
    assert prev_op.ofm is not None
    curr_input_ranges = get_address_ranges(npu_op.ifm)

    if has_ifm2(npu_op):
        assert npu_op.ifm2 is not None
        curr_input_ranges.extend(get_address_ranges(npu_op.ifm2))
    for prev_range in get_address_ranges(prev_op.ofm):
        if prev_range is None:
            continue
        for curr_range in curr_input_ranges:
            if curr_range is not None and ranges_overlap(prev_range, curr_range):
                return True
    return False


def shape3d_to_rect(shape: NpuShape3D) -> Rect:
    return Rect(0, 0, 0, shape.width - 1, shape.height - 1, shape.depth - 1)


def get_ifm_ofm_block_depth(arch: ArchitectureFeatures, npu_op: NpuBlockOperation) -> int:
    # Note: NOT equivalent to the normal ifm block depth calculation since
    # it takes into account 'depthless' block operations by returning full
    # depth
    if npu_op.op_type == NpuOperationType.Conv2D:
        res = arch.calc_ifm_block_depth(npu_op.ifm.shape.depth, npu_op.ifm.data_type.size_in_bits())
        return res
    return npu_op.ofm.shape.depth


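# BLOCKDEP controls how many blocks of the previous operation may still be in
# flight when the next operation starts: 0 fully serialises the two
# operations, while ArchitectureFeatures.MAX_BLOCKDEP lets independent
# operations overlap freely (a descriptive summary of calc_blockdep below).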
def calc_blockdep(
    arch: ArchitectureFeatures,
    prev_op: Optional[NpuBlockOperation],
    prev_block_config: Optional[NpuShape3D],
    npu_op: NpuBlockOperation,
    block_config: NpuShape3D,
) -> int:
    """Calculates the value of the BLOCKDEP register"""
    if prev_op is None:
        return 0
    if not is_dependent_on_prev_op(prev_op, npu_op):
        return ArchitectureFeatures.MAX_BLOCKDEP
    if prev_op.ofm.shape != npu_op.ifm.shape:
        return 0
    prev_ifm_block_depth = get_ifm_ofm_block_depth(arch, prev_op)
    prev_ofm_block = Block(prev_block_config.width, prev_block_config.height, prev_block_config.depth)
    prev_ofm_rect = shape3d_to_rect(prev_op.ofm.shape)
    prev_ifm_rect = shape3d_to_rect(prev_op.ifm.shape)
    cur_ifm_block_depth = get_ifm_ofm_block_depth(arch, npu_op)
    cur_ofm_block = Block(block_config.width, block_config.height, block_config.depth)
    cur_ofm_rect = shape3d_to_rect(npu_op.ofm.shape)
    cur_ifm_rect = shape3d_to_rect(npu_op.ifm.shape)
    cur_padLT = (0, 0) if npu_op.padding is None else (npu_op.padding.left, npu_op.padding.top)
    blockdep = arch.calc_block_dep(
        prev_ifm_rect,
        prev_ofm_rect,
        prev_ifm_block_depth,
        prev_ofm_block,
        to_kernel(prev_op.kernel),
        cur_ifm_rect,
        cur_ofm_rect,
        cur_ifm_block_depth,
        cur_ofm_block,
        to_kernel(npu_op.kernel),
        cur_padLT,
    )
    return blockdep


# -------------------------------------------------------------------
# PRINT
# -------------------------------------------------------------------


def print_feature_map(fm: NpuFeatureMap, name: str):
    if fm is not None:
        q = (
            "no quantization"
            if fm.quantization is None
            else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
        )
        h, w, c = fm.shape
        sz = h * w * c * fm.data_type.size_in_bytes()
        print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
        strides = get_strides(fm)
        stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
        t = fm.tiles
        addresses = [hex(addr) for addr in t.addresses]
        print(f"        {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")


def print_operation(npu_op: NpuOperation, index: int = 0):
    pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""
    if is_dma_op(npu_op):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        if (
            npu_op.op_type == NpuOperationType.Conv2D
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if npu_op.op_type == NpuOperationType.Conv2D:
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")


def print_operations(npu_op_list: List[NpuOperation]):
    for index, npu_op in enumerate(npu_op_list):
        print_operation(npu_op, index)


# -------------------------------------------------------------------
# OPERATIONS
# -------------------------------------------------------------------


def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    op_type = npu_op.op_type
    if op_type == NpuOperationType.Dma:
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif op_type == NpuOperationType.Conv2D:
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif op_type == NpuOperationType.ConvDepthWise:
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif op_type == NpuOperationType.Pooling:
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif op_type == NpuOperationType.ElementWise:
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"


def generate_conv2d_op(
    emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures
) -> NpuShape3D:
    """Generates register commands for Conv2D operations"""
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ConvolutionMxN, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for depthwise convolution operations"""
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(
        arch, npu_op, NpuBlockType.ConvolutionDepthWise, ifm_resampling_mode
    )
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations"""
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    npu_block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, npu_block_type, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_non_elementwise(emit, shared_buffer)
    return block_config


def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    use_global_scale = npu_op.sub_op_type in (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type not in unary_elementwise_ops:
        # Binary operation; generate IFM2 registers
        assert npu_op.ifm2 is not None
        has_scalar = npu_op.ifm2_scalar is not None
        generate_ifm2(emit, npu_op.ifm2, has_scalar)
        generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
        generate_ifm2_broadcast(emit, npu_op)
        if has_scalar:
            quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
            assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
            emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    shared_buffer = shared_buffer_allocation_for_npu_op(arch, npu_op, NpuBlockType.ElementWise, ifm_resampling_mode)
    block_config = generate_block_config(emit, npu_op, arch, shared_buffer)
    generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)
    return block_config


def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)


def generate_registers_for_op(
    emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures
) -> Optional[NpuShape3D]:
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    Returns the selected block config
    """
    op_type = npu_op.op_type
    block_config = None
    if op_type == NpuOperationType.Conv2D:
        block_config = generate_conv2d_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ConvDepthWise:
        block_config = generate_conv_depthwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Pooling:
        block_config = generate_pooling_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.ElementWise:
        block_config = generate_elementwise_op(emit, npu_op, arch)
    elif op_type == NpuOperationType.Dma:
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
    return block_config


def generate_command_stream(
    emit: CommandStreamEmitter, npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, add_to_debug_db=None
):
    """Generates register commands for the given list of NPU operations"""
    # Calculate memory accesses for every operation
    memory_accesses = {}
    for npu_op in npu_op_list:
        if is_dma_op(npu_op):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        else:
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
    if arch.is_yoda_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    prev_block_config = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
        block_config = generate_registers_for_op(emit, npu_op, arch)
        if not is_dma_op(npu_op):
            # Generate BLOCKDEP
            assert block_config is not None
            blockdep = calc_blockdep(arch, prev_op, prev_block_config, npu_op, block_config)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op
            prev_block_config = block_config

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)


def generate_register_command_stream_for_sg(nng, sg, arch, verbose=False):
    """Generates command stream for the subgraph, adds it to sg.register_command_stream"""
    # Convert high level command stream to list of NpuOperation
    npu_op_list = []
    npu_op_to_cmd = dict()  # map from npu op to high level command
    for cmd in sg.high_level_command_stream:
        if cmd.cmdtype == CommandType.NpuStripe and cmd.ps.npu_block_type == NpuBlockType.Default:
            print("Warning: Skipping register command stream generation for", cmd.ps)
        else:
            npu_op = convert_command_to_npu_op(cmd, arch)
            npu_op_list.append(npu_op)
            npu_op_to_cmd[npu_op] = cmd
    if verbose:
        print_operations(npu_op_list)
    # Generate register commands
    stream_id = DebugDatabase.add_stream(sg)
    DebugDatabase.set_stream_offset(sg, 0)  # Default to zero, can only set during file writing
    emit = CommandStreamEmitter()

    def add_to_debug_db(npu_op: NpuOperation, offset: int):
        """Adds info to the debug database"""
        if not is_dma_op(npu_op):
            cmd = npu_op_to_cmd[npu_op]
            DebugDatabase.add_command(stream_id, offset, cmd.ps.primary_op)

    generate_command_stream(emit, npu_op_list, arch, add_to_debug_db)
    sg.register_command_stream = emit.to_list()
    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(sg.register_command_stream))


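# A minimal usage sketch of the public API below (illustrative only; the
# NpuDmaOperation/NpuAddressRange construction assumes the definitions in
# api.py, and real feature-map operations need many more fields filled in):
#
#   dma = NpuDmaOperation(
#       src=NpuAddressRange(region=0, address=0x100, length=1024),
#       dest=NpuAddressRange(region=1, address=0x0, length=1024),
#   )
#   cmds = generate_register_command_stream([dma], Accelerator.Ethos_U55_128)
#   # "cmds" is now a list of 32-bit words for the Ethos-U55 command stream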
def generate_register_command_stream(npu_op_list: List[NpuOperation], accelerator: Accelerator) -> List[int]:
    """
    Public facing API for generating an ethosu register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param accelerator: architecture_features.Accelerator enum to pick the correct ethosu accelerator
    :return: ethosu instructions, as a list of 32-bit integers
    """
    emit = CommandStreamEmitter()
    arch = ArchitectureFeatures(
        vela_config=None,
        system_config=None,
        accelerator_config=accelerator.value,
        override_block_config=None,
        block_config_limit=None,
        global_memory_clock_scale=1.0,
        max_blockdep=ArchitectureFeatures.MAX_BLOCKDEP,
        weight_estimation_scaling=1.0,
    )
    generate_command_stream(emit, npu_op_list, arch)
    return emit.to_list()