Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

1

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

17

# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

19

# stream suitable for interpretation by the Ethos-U processor.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

20

from collections import defaultdict

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

21

from enum import Enum

22

from enum import IntEnum

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

23

from typing import List

24

from typing import Optional

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

from . import scaling

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

29

from .api import NpuAccelerator

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

30

from .api import NpuActivation

31

from .api import NpuActivationOp

32

from .api import NpuAddressRange

33

from .api import NpuBlockOperation

34

from .api import NpuBlockTraversal

35

from .api import NpuConv2DOperation

36

from .api import NpuDataType

37

from .api import NpuDmaOperation

38

from .api import NpuElementWiseOp

39

from .api import NpuElementWiseOperation

40

from .api import NpuFeatureMap

41

from .api import NpuKernel

42

from .api import NpuLayout

43

from .api import NpuOperation

44

from .api import NpuOperationType

45

from .api import NpuPadding

46

from .api import NpuPoolingOp

47

from .api import NpuPoolingOperation

48

from .api import NpuQuantization

49

from .api import NpuResamplingMode

50

from .api import NpuRoundingMode

51

from .api import NpuShape3D

52

from .api import NpuTileBox

53

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

54

from .architecture_features import ArchitectureFeatures

55

from .architecture_features import Block

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

56

from .architecture_features import create_default_arch

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

57

from .architecture_features import SharedBufferArea

58

from .architecture_features import SHRAMElements

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

59

from .ethos_u55_regs.ethos_u55_regs import acc_format

60

from .ethos_u55_regs.ethos_u55_regs import activation

61

from .ethos_u55_regs.ethos_u55_regs import cmd0

62

from .ethos_u55_regs.ethos_u55_regs import cmd1

63

from .ethos_u55_regs.ethos_u55_regs import elementwise_mode

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

64

from .ethos_u55_regs.ethos_u55_regs import pooling_mode

Jacob Bohlin

cf7da10

2020-05-20 09:03:40 +0200

[diff] [blame]

65

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

66

from .ethos_u55_regs.ethos_u55_regs import rounding

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

67

from .numeric_util import quantise_float32

68

from .numeric_util import round_away_zero

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

69

from .numeric_util import round_up_to_int

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

70

from .operation import NpuBlockType

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

71

from .register_command_stream_util import calc_blockdep

72

from .register_command_stream_util import get_dma_memory_accesses

73

from .register_command_stream_util import get_op_memory_accesses

74

from .register_command_stream_util import get_strides

75

from .register_command_stream_util import get_wait_dependency

76

from .register_command_stream_util import has_ifm2

77

from .register_command_stream_util import is_dma_op

78

from .register_command_stream_util import to_kernel

79

from .register_command_stream_util import UNARY_ELEMWISE_OPS

80

from .register_command_stream_util import Watermark

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

81

from .shared_buffer_allocation import find_suitable_block_configs

82

from .shared_buffer_allocation import shared_buffer_allocation_for_npu_op

83

from .shared_buffer_allocation import SharedBufferAllocation

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

84

85

86

class RegisterMachine:

87

def __init__(self):

88

self.n_banks = 1

89

self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]

90

self.bank_idx = 0

91

92

def set_register(self, reg, value):

93

is_changed = self.registers[self.bank_idx][reg] != value

94

self.registers[self.bank_idx][reg] = value

95

# is_changed = True # force command

96

return is_changed

97

98

def switch_bank(self):

99

self.bank_idx = (self.bank_idx + 1) % self.n_banks

100

101

102

class CmdMode(IntEnum):

NoPayload = 0x0000

Payload32 = 0x4000

Mask = 0xC000

CmdOpMask = 0x03FF

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

109

class CommandStreamEmitter:

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

110

WORD_SIZE = 4

111

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

112

def __init__(self):

113

self.cmd_stream = []

114

self.reg_machine = [RegisterMachine(), RegisterMachine()]

115

self.last_absolute_wait = defaultdict(int)

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

116

self.offset = 0

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

117

118

def get_reg_machine(self, cmd):

119

if "DMA" in cmd.name:

120

return self.reg_machine[1]

121

else:

122

return self.reg_machine[0]

123

124

def size_in_bytes(self):

125

sz = 0

126

for cmd in self.cmd_stream:

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

127

sz += len(cmd) * CommandStreamEmitter.WORD_SIZE

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

128

return sz

129

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

130

def to_list(self) -> List[int]:

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

131

return [elem for cmd in self.cmd_stream for elem in cmd]

132

133

def print_cmds(self):

134

print("Code: Command: Param: Payload:")

135

for words_for_one_command in self.cmd_stream:

136

code = words_for_one_command[0] & 0x0000FFFF # lower 16 bits

137

param = words_for_one_command[0] >> 16 # higher 16 bits

138

139

payload_mode = CmdMode(code & CmdMode.Mask)

140

141

# code and command

142

s = " 0x%04x " % code

143

if payload_mode == CmdMode.NoPayload:

144

s += str(cmd0(code & CmdMode.CmdOpMask))

145

else:

146

s += str(cmd1(code & CmdMode.CmdOpMask))

s = s.ljust(40)

s += "%5d" % param

# payload

if payload_mode == CmdMode.Payload32:

153

s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])

else:

s += " -"

print(s)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

159

def cmd0_with_param(self, cmd: cmd0, param):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

160

if isinstance(param, Enum):

161

param = int(param.value)

162

else:

163

param = int(param)

164

param = param & 0xFFFF

165

command = cmd.value | (param << 16)

166

if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):

167

return

168

169

# This is not a redundant command, actually write it

170

self.cmd_stream.append((command,))

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

171

self.offset += CommandStreamEmitter.WORD_SIZE

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

172

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

173

def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

174

offset = int(offset) & 0xFFFFFFFFF

175

command = cmd.value | CmdMode.Payload32.value | (param << 16)

176

177

if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):

178

return

179

180

# This is not a redundant command, actually write it

181

self.cmd_stream.append((command, offset))

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

182

self.offset += CommandStreamEmitter.WORD_SIZE * 2

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

183

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

184

def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):

Tim Hall

289a41d

2020-08-04 21:40:14 +0100

[diff] [blame]

185

param = (16 * channel) + outstanding_count

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

186

command = ((param & 0xFFFF) << 16) | cmd.value

187

self.cmd_stream.append((command,))

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

188

self.offset += CommandStreamEmitter.WORD_SIZE

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

189

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

190

def cmd_do_operation(self, cmd: cmd0, param=0):

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

191

param = int(param)

192

command = ((param & 0xFFFF) << 16) | cmd.value

193

194

self.cmd_stream.append((command,))

Tim Hall

2020-11-09 16:46:37 +0000

[diff] [blame]

195

self.offset += CommandStreamEmitter.WORD_SIZE

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

196

self.get_reg_machine(cmd).switch_bank()

197

198

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

199

# -------------------------------------------------------------------

200

# REGISTER GENERATION

201

# -------------------------------------------------------------------

202

203

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

204

# TODO: Replace with definitions from ethos_u55_regs

205

class IFM2Broadcast(IntEnum):

206

BroadcastHdim = 1 << 0

207

BroadcastWdim = 1 << 1

208

BroadcastCdim = 1 << 2

209

ReverseOperandOrder = 1 << 6

210

UseIFM2Scalar = 1 << 7

pooling_op_map = {

NpuPoolingOp.MAX: pooling_mode.MAX.value,

215

NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,

216

NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,

217

}

218

219

elementwise_op_map = {

220

NpuElementWiseOp.MUL: elementwise_mode.MUL.value,

221

NpuElementWiseOp.ADD: elementwise_mode.ADD.value,

222

NpuElementWiseOp.SUB: elementwise_mode.SUB.value,

223

NpuElementWiseOp.MIN: elementwise_mode.MIN.value,

224

NpuElementWiseOp.MAX: elementwise_mode.MAX.value,

225

NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,

226

NpuElementWiseOp.ABS: elementwise_mode.ABS.value,

227

NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,

228

NpuElementWiseOp.SHR: elementwise_mode.SHR.value,

229

NpuElementWiseOp.SHL: elementwise_mode.SHL.value,

230

}

231

232

activation_op_map = {

233

NpuActivationOp.NONE_OR_RELU: activation.NONE,

234

NpuActivationOp.TANH: activation.TANH,

235

NpuActivationOp.SIGMOID: activation.SIGMOID,

236

}

237

238

# Maps an AccumulatorType enum to the corresponding acc_format value

239

acc_format_map = {

240

SHRAMElements.Acc16: acc_format.FP_S5_10.value,

241

SHRAMElements.Acc32: acc_format.INT_32BIT.value,

242

SHRAMElements.Acc40: acc_format.INT_40BIT.value,

243

}

244

245

resampling_mode_map = {

246

NpuResamplingMode.NONE: resampling_mode.NONE,

247

NpuResamplingMode.NEAREST: resampling_mode.NEAREST,

248

NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,

249

}

250

251

# Maps data type size in bits to activation precision

252

precision_map = {8: 0, 16: 1, 32: 2}

253

254

# Maps rounding mode to the corresponding value

255

rounding_mode_map = {

256

NpuRoundingMode.TFL: rounding.TFL.value,

257

NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,

258

NpuRoundingMode.NATURAL: rounding.NATURAL.value,

}

def quantise(value: float, quant: Optional[NpuQuantization]) -> int:

263

"""Quantizes the given value"""

264

scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32

265

zp = 0 if quant is None else quant.zero_point

266

return quantise_float32(value, scale, zp)

267

268

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

269

def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):

270

"""Generates IFM_PAD registers"""

271

emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)

272

emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)

273

emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)

274

emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)

275

276

277

def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):

278

"""Generates ACTIVATION registers"""

279

act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

280

281

if act.min is None:

282

quantized_min = ofm.data_type.min_value()

283

else:

284

quantized_min = quantise(act.min, ofm.quantization)

285

if act.max is None:

286

quantized_max = ofm.data_type.max_value()

287

else:

288

quantized_max = quantise(act.max, ofm.quantization)

289

quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())

290

quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())

291

if act.op_type == NpuActivationOp.TABLE_LOOKUP:

292

assert 0 <= act.lookup_table_index < 8

293

activation_value = 16 + act.lookup_table_index

294

if ofm.data_type == NpuDataType.INT32:

295

activation_value |= 3 << 12 # Force I8 range

296

quantized_min = max(-128, quantized_min)

297

quantized_max = min(127, quantized_max)

298

else:

299

activation_value = activation_op_map[act.op_type]

300

emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)

301

emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)

302

emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)

303

304

305

def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):

306

"""Generates xFM_BASE registers"""

307

if layout == NpuLayout.NHCWB16:

308

# Check that all BasePointer addresses are aligned to 16 bytes

309

assert all((int(addr) % 16) == 0 for addr in addresses)

310

emit.cmd1_with_offset(ptr_cmds[0], addresses[0])

311

emit.cmd1_with_offset(ptr_cmds[1], addresses[1])

312

emit.cmd1_with_offset(ptr_cmds[2], addresses[2])

313

emit.cmd1_with_offset(ptr_cmds[3], addresses[3])

314

315

316

def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):

317

"""Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""

318

emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)

319

emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)

320

emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)

321

322

323

def generate_strides(

324

emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1

325

):

326

"""Generates STRIDE_C/Y/X registers"""

327

strides = get_strides(fm)

328

emit.cmd1_with_offset(stride_c_cmd, strides.depth) # stride between 16-byte channel blocks (C)

329

emit.cmd1_with_offset(stride_y_cmd, strides.height) # stride between vertical values (H)

330

emit.cmd1_with_offset(stride_x_cmd, strides.width) # stride between horisontal values (W)

331

332

333

def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):

334

"""Generates IFM/IFM2_PRECISION register"""

335

dtype = fm.data_type

336

prec = 1 if dtype.is_signed() else 0

337

activation_precision = precision_map[dtype.size_in_bits()]

338

prec += activation_precision << 2

339

340

if fm.layout == NpuLayout.NHCWB16:

341

prec |= 1 << 6

342

343

prec |= op_to_scale << 8

344

emit.cmd0_with_param(precision_cmd, prec)

345

346

347

def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):

348

"""Generates OFM_PRECISION register"""

349

dtype = npu_op.ofm.data_type

350

prec = 1 if dtype.is_signed() else 0

351

activation_precision = precision_map[dtype.size_in_bits()]

352

prec += activation_precision << 1

353

354

if use_global_scale:

355

# Set global scale bit, as opposed to using per channel scale

356

prec |= 1 << 8

357

if npu_op.ofm.layout == NpuLayout.NHCWB16:

358

prec |= 1 << 6

359

prec |= rounding_mode_map[npu_op.rounding_mode] << 14

360

emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

361

362

363

def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):

364

"""Generates IFM2_BROADCAST register for binary elementwise operations"""

ifm2_broadcast = 0

ifm = npu_op.ifm

ifm2 = npu_op.ifm2

if npu_op.reversed_operands:

369

ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder

370

if npu_op.ifm2_scalar is not None:

371

# IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST

372

ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar

373

else:

374

if ifm.shape.height != ifm2.shape.height:

375

# Broadcast in 'H' dimension

376

assert ifm2.shape.height == 1

377

ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

378

379

if ifm.shape.width != ifm2.shape.width:

380

# Broadcast in 'W' dimension

381

assert ifm2.shape.width == 1

382

ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

383

384

if ifm.shape.depth != ifm2.shape.depth:

385

# Broadcast in 'C' dimension

386

assert ifm2.shape.depth == 1

387

ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

388

389

emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

390

391

392

def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):

393

"""Generates general IFM registers"""

394

emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)

395

generate_addresses(

396

emit,

397

[cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],

ifm.tiles.addresses,

ifm.layout,

)

generate_tiles(

emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles

403

)

404

emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)

405

generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)

406

emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))

407

408

409

def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):

410

"""Generates general IFM2 registers"""

411

if not has_scalar:

412

emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)

413

generate_addresses(

414

emit,

415

[cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],

416

ifm2.tiles.addresses,

ifm2.layout,

)

generate_tiles(

emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles

421

)

422

generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)

423

emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))

424

425

426

def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):

427

"""Generates general OFM registers"""

428

emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)

429

generate_addresses(

430

emit,

431

[cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],

ofm.tiles.addresses,

ofm.layout,

)

generate_tiles(

emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles

437

)

438

emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)

439

emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)

440

emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)

441

generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)

442

emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))

443

444

445

def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):

446

"""Generates KERNEL related registers"""

447

emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))

448

emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

449

# set kernel x stride low bit

450

stride = (kernel.stride_x - 1) & 1

451

# set kernel y stride low bit

452

stride |= (kernel.stride_y - 1 & 1) << 1

453

# set kernel x stride extension bits

454

stride |= (kernel.stride_x - 1 >> 1) << 6

455

# set kernel y stride extension bits

456

stride |= (kernel.stride_y - 1 >> 1) << 9

457

stride |= (kernel.dilation_x - 1) << 3

458

stride |= (kernel.dilation_y - 1) << 4

459

if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:

460

stride |= 1 << 2

461

emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

462

463

464

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):

465

"""Generates WEIGHT registers"""

466

if len(weights) == 0:

467

return

468

emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

469

# Set weights sources for active and present cores

470

for core, (addr, length) in enumerate(

471

[

472

(cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),

473

(cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),

474

]

475

):

476

if core < len(weights):

477

emit.cmd1_with_offset(addr, weights[core].address)

478

emit.cmd1_with_offset(length, weights[core].length)

479

elif core < arch.ncores:

480

emit.cmd1_with_offset(addr, weights[0].address)

481

emit.cmd1_with_offset(length, 0)

482

483

484

def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):

485

"""Generates SCALE registers"""

486

if len(biases) == 0:

487

return

488

emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

489

# Set weights sources for active and present cores

490

for core, (addr, length) in enumerate(

491

[(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]

492

):

493

if core < len(biases):

494

emit.cmd1_with_offset(addr, biases[core].address)

495

emit.cmd1_with_offset(length, biases[core].length)

496

elif core < arch.ncores:

497

emit.cmd1_with_offset(addr, biases[0].address)

498

emit.cmd1_with_offset(length, 0)

499

500

501

def generate_block_config(

502

emit: CommandStreamEmitter,

503

npu_op: NpuBlockOperation,

504

arch: ArchitectureFeatures,

505

shared_buffer: SharedBufferAllocation,

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

506

):

507

"""Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

508

block_config = npu_op.block_config

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

509

assert block_config is not None, "block_config has not been set"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

510

alloc = shared_buffer.try_block(Block(block_config.width, block_config.height, block_config.depth))

511

assert alloc is not None, f"Block config {block_config} does not fit, op: {npu_op.op_type}"

512

emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)

513

emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)

514

emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

515

516

517

def generate_shram_registers_elementwise(

518

emit: CommandStreamEmitter,

519

npu_op: NpuElementWiseOperation,

520

arch: ArchitectureFeatures,

521

shared_buffer: SharedBufferAllocation,

522

):

523

"""Generates IB_END/IB_START/AB_START registers for elementwise operations"""

524

# For elementwise set the required SHRAM to be equal to the total size of available SHRAM

525

uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP

526

shram_required = arch.available_shram_banks(uses_lut)

527

528

# Acc buffers not needed so set AB_START to size of SHRAM

529

emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_required)

530

emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_required)

531

if has_ifm2(npu_op):

532

# Set IFM2_IB_START to the latter half of the IB space

533

ifm_ib_start = shared_buffer.bank_locations[SharedBufferArea.IFM]

534

emit.cmd0_with_param(

535

cmd0.NPU_SET_IFM2_IB_START, (shram_required - ifm_ib_start) // shared_buffer.ifm_count + ifm_ib_start,

536

)

537

emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

538

539

540

def generate_shram_registers_non_elementwise(emit: CommandStreamEmitter, shared_buffer: SharedBufferAllocation):

541

"""Generates IB_END/IB_START/AB_START registers for non-elementwise operations"""

542

emit.cmd0_with_param(

543

cmd0.NPU_SET_IFM_IB_END,

544

shared_buffer.bank_locations[SharedBufferArea.IFM] + shared_buffer.banks_required[SharedBufferArea.IFM],

545

)

546

emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shared_buffer.bank_locations[SharedBufferArea.Accumulators])

547

emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[shared_buffer.use_accumulator_element])

548

549

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

550

def create_shared_buffer(npu_op: NpuBlockOperation, arch: ArchitectureFeatures) -> SharedBufferAllocation:

551

"""Creates shared buffer allocation for the given operation"""

552

op_type = npu_op.op_type

553

block_type = NpuBlockType.Default

554

if op_type == NpuOperationType.Conv2D:

555

block_type = NpuBlockType.ConvolutionMxN

556

elif op_type == NpuOperationType.ConvDepthWise:

557

block_type = NpuBlockType.ConvolutionDepthWise

558

elif op_type == NpuOperationType.Pooling:

559

block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling

560

elif op_type == NpuOperationType.ElementWise:

561

block_type = NpuBlockType.ElementWise

562

else:

563

assert 0, "Unsupported operation"

564

ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]

565

return shared_buffer_allocation_for_npu_op(arch, npu_op, block_type, ifm_resampling_mode)

566

567

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

568

def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):

569

"""Generates KERNEL_WAIT/DMA_WAIT"""

570

if cmd_waits.npu >= 0:

571

emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

572

573

if cmd_waits.dma >= 0:

574

emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)

575

576

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

577

def generate_common(

578

emit: CommandStreamEmitter,

579

npu_op: NpuBlockOperation,

580

block_traversal: NpuBlockTraversal,

581

arch: ArchitectureFeatures,

582

use_global_scale: bool = False,

583

op_to_scale: int = 0,

584

):

585

"""Generate registers that are common to most operations"""

586

assert npu_op.ifm is not None and npu_op.ofm is not None

587

generate_ifm(emit, npu_op.ifm)

588

generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)

589

emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])

590

if npu_op.padding is not None:

591

generate_padding(emit, npu_op.padding)

592

generate_ofm(emit, npu_op.ofm)

593

generate_ofm_precision(emit, npu_op, use_global_scale)

594

if npu_op.op_type != NpuOperationType.ElementWise:

595

assert npu_op.kernel is not None

596

generate_kernel(emit, npu_op.kernel, block_traversal)

597

generate_weights(emit, npu_op.weights, arch)

598

generate_biases(emit, npu_op.biases, arch)

599

generate_activation(emit, npu_op.activation, npu_op.ofm)

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

600

shared_buffer = create_shared_buffer(npu_op, arch)

601

generate_block_config(emit, npu_op, arch, shared_buffer)

602

if npu_op.op_type == NpuOperationType.ElementWise:

603

generate_shram_registers_elementwise(emit, npu_op, arch, shared_buffer)

604

else:

605

generate_shram_registers_non_elementwise(emit, shared_buffer)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

606

607

608

# -------------------------------------------------------------------

609

# SCALING

610

# -------------------------------------------------------------------

611

612

613

def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):

614

"""Generates OFM_SCALE register for pooling operations"""

615

# For valid padding vela has to output scaling values

616

kernel = pool_op.kernel

617

ifm_quant = pool_op.ifm.quantization

618

ofm_quant = pool_op.ofm.quantization

619

if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):

620

assert ifm_quant.scale_f32 is not None

621

rescale = 0x3000 * ifm_quant.scale_f32

622

if pool_op.ifm.data_type == NpuDataType.INT16:

623

# Calculate scale and shift for the output scale of 1/(3*4096)

624

shift = 0

625

max_rescale = np.iinfo(np.int16).max / 2

626

while rescale <= max_rescale and shift <= 30:

shift += 1

rescale *= 2

scale = int(rescale)

else:

rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1

632

scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)

633

scale = int(round_away_zero(scale * rescale))

634

elif pool_op.fused_quantize:

635

# Quantize op requires different scaling

636

ifm_scale_f64 = np.double(ifm_quant.scale_f32)

637

ofm_scale_f64 = np.double(ofm_quant.scale_f32)

638

scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)

639

elif pool_op.rescale is not None:

640

# for ResizeBilinear operations with "rescale" in primary_op.attrs

641

rescale = pool_op.rescale

642

rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1

643

scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)

644

scale = int(round_away_zero(scale * rescale))

645

else:

646

# In case avg pool fused with concat or other memory operation, rescaling might be needed.

647

# kernel height == kernel width == 1 is always true in this case

648

# Normally the scale is maximised, to get maximum precision, which means that

649

# if rescale != 1, scale need to consider the number of bits needed for rescaling

650

if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:

651

rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32

652

rescale_bits = 0

653

if kernel.height == kernel.width == 1:

654

if rescale > 1:

655

rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1

656

elif rescale < 1:

657

rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)

658

scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)

659

scale = int(round_away_zero(scale * rescale))

else:

scale = 1

shift = 0

emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

665

666

667

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:

668

"""

669

Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

670

Returns the operator to scale

671

"""

672

op_to_scale = 0

673

if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):

674

input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None

675

input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None

676

output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

677

678

if npu_op.activation is not None and npu_op.activation.op_type in (

679

NpuActivationOp.SIGMOID,

680

NpuActivationOp.TANH,

681

):

682

output_scale = 1 / 0x3000

683

684

if npu_op.sub_op_type == NpuElementWiseOp.MUL:

685

if None in (input_scale, input2_scale, output_scale):

ofm_scale = 1

shift = 0

else:

ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)

690

emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

691

else: # Add/Sub

692

if None in (input_scale, input2_scale, output_scale):

693

opa_scale = opb_scale = ofm_scale = 1

694

opa_shift = shift = 0

695

if npu_op.rescale is not None:

696

ofm_scale, shift = npu_op.rescale

697

elif input_scale == input2_scale:

698

opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(

699

input_scale, input2_scale, output_scale

700

)

701

opa_shift = 0 # Unused for this case

702

else:

703

# Use advanced implementation only when input scales differ

704

bitdepth = npu_op.ifm.data_type.size_in_bits()

705

(opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(

706

input_scale, input2_scale, output_scale, bitdepth

707

)

708

opb_scale = 0 # Unused for this case

709

if npu_op.reversed_operands:

710

# If the operand order is reversed we also have to swap which operand is scaled

711

if op_to_scale == scaling.OperandToScale.OPa:

712

op_to_scale = scaling.OperandToScale.OPb

713

else:

714

op_to_scale = scaling.OperandToScale.OPa

715

emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)

716

emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)

717

emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

718

elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):

719

output_scale = npu_op.ofm.quantization.scale_f32

720

ofm_scale, shift = scaling.quantise_scale(output_scale)

721

emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)

722

else:

723

emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)

return op_to_scale

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

728

# PRINT

729

# -------------------------------------------------------------------

Jacob Bohlin

e99b893

2020-07-13 16:01:51 +0200

[diff] [blame]

730

731

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

732

def print_feature_map(fm: NpuFeatureMap, name: str):

if fm is not None:

q = (

"no quantization"

if fm.quantization is None

737

else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

738

)

739

h, w, c = fm.shape

740

sz = h * w * c * fm.data_type.size_in_bytes()

741

print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")

742

strides = get_strides(fm)

743

stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"

744

t = fm.tiles

745

addresses = [hex(addr) for addr in t.addresses]

746

print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

747

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

748

749

def print_operation(npu_op: NpuOperation, index: int = 0):

750

pass_info = f", {npu_op.cmd}" if hasattr(npu_op, "cmd") else ""

751

if is_dma_op(npu_op):

752

print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")

753

return

754

k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)

755

if npu_op.op_type in (NpuOperationType.Pooling, NpuOperationType.ElementWise):

756

print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")

Patrik Gustavsson

eca2e95

2020-05-27 09:15:11 +0200

[diff] [blame]

757

else:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

758

if (

759

npu_op.op_type == NpuOperationType.Conv2D

760

and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1

761

):

762

fc = "FullyConnected "

763

else:

764

fc = ""

765

print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")

766

print_feature_map(npu_op.ifm, "IFM")

767

if npu_op.ifm2_scalar is not None:

768

quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)

769

print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")

770

else:

771

print_feature_map(npu_op.ifm2, "IFM2")

772

print_feature_map(npu_op.ofm, "OFM")

773

if k is not None and npu_op.op_type != NpuOperationType.ElementWise:

774

print(f" Kernel: {k}")

775

if npu_op.padding is not None:

776

print(f" {npu_op.padding}")

777

for weights in npu_op.weights:

778

print(f" Weights: {weights}")

779

for bias in npu_op.biases:

780

print(f" Scales: {bias}")

781

if npu_op.activation is not None:

782

act = npu_op.activation

783

if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:

784

lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""

785

print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

786

if npu_op.op_type == NpuOperationType.Conv2D:

787

print(f" {npu_op.block_traversal}")

788

bh, bw, bc = npu_op.block_config

789

rescale = f", rescale={npu_op.rescale}" if hasattr(npu_op, "rescale") else ""

790

print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

791

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

792

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

793

def print_operations(npu_op_list: List[NpuOperation]):

794

for index, npu_op in enumerate(npu_op_list):

795

print_operation(npu_op, index)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

796

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

797

798

# -------------------------------------------------------------------

799

# OPERATIONS

800

# -------------------------------------------------------------------

801

802

803

def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):

804

"""Generates NPU_OP_* command"""

805

op_type = npu_op.op_type

806

if op_type == NpuOperationType.Dma:

807

emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)

808

elif op_type == NpuOperationType.Conv2D:

809

emit.cmd_do_operation(cmd0.NPU_OP_CONV)

810

elif op_type == NpuOperationType.ConvDepthWise:

811

emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)

812

elif op_type == NpuOperationType.Pooling:

813

emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])

814

elif op_type == NpuOperationType.ElementWise:

815

emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])

816

else:

817

assert 0, "Unsupported operation"

818

819

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

820

def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

821

"""Generates register commands for Conv2D operations"""

822

generate_common(emit, npu_op, npu_op.block_traversal, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

823

824

825

def generate_conv_depthwise_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):

826

"""Generates register commands for depthwise convolution operations"""

827

generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

828

829

830

def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):

831

"""Generates register commands for pooling operations"""

832

use_global_scale = (

833

npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0

834

)

835

generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)

836

# Pooling op specific

837

if use_global_scale:

838

generate_ofm_scaling_for_pooling(emit, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

839

840

841

def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):

842

"""Generates register commands for elementwise operations"""

843

use_global_scale = npu_op.sub_op_type in (

844

NpuElementWiseOp.ADD,

845

NpuElementWiseOp.SUB,

846

NpuElementWiseOp.MUL,

847

NpuElementWiseOp.LRELU,

848

NpuElementWiseOp.ABS,

849

)

850

op_to_scale = generate_scaling_for_elementwise(emit, npu_op)

851

generate_common(

852

emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale

853

)

854

# Elementwise op specific

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

855

if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

856

# Binary operation; generate IFM2 registers

857

assert npu_op.ifm2 is not None

858

has_scalar = npu_op.ifm2_scalar is not None

859

generate_ifm2(emit, npu_op.ifm2, has_scalar)

860

generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)

861

generate_ifm2_broadcast(emit, npu_op)

862

if has_scalar:

863

quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)

864

assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()

865

emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

866

867

868

def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):

869

"""Generates register commands for DMA operations"""

870

emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)

871

emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)

872

emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

873

874

emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)

875

emit.cmd1_with_offset(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)

876

877

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

878

def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

879

"""

880

Generates register commands for the given operation, but not the final NPU_OP_... command.

881

Returns the selected block config

882

"""

883

op_type = npu_op.op_type

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

884

if op_type == NpuOperationType.Conv2D:

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

885

generate_conv2d_op(emit, npu_op, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

886

elif op_type == NpuOperationType.ConvDepthWise:

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

887

generate_conv_depthwise_op(emit, npu_op, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

888

elif op_type == NpuOperationType.Pooling:

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

889

generate_pooling_op(emit, npu_op, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

890

elif op_type == NpuOperationType.ElementWise:

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

891

generate_elementwise_op(emit, npu_op, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

892

elif op_type == NpuOperationType.Dma:

893

generate_dma_op(emit, npu_op)

894

else:

895

assert 0, "Unsupported operation"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

896

897

898

def generate_command_stream(

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

899

npu_op_list: List[NpuOperation], arch: ArchitectureFeatures, verbose: bool, add_to_debug_db=None,

900

) -> List[int]:

901

"""

902

Generates register commands for the given list of NPU operations.

903

Returns Ethos-U instructions, as a list of 32-bit integers.

904

"""

905

emit = CommandStreamEmitter()

906

if verbose:

907

print_operations(npu_op_list)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

908

# Calculate memory accesses for every operation

Tim Hall

289a41d

2020-08-04 21:40:14 +0100

[diff] [blame]

909

memory_accesses = {}

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

910

for npu_op in npu_op_list:

911

if is_dma_op(npu_op):

912

memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)

913

else:

914

memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

915

if arch.is_ethos_u65_system:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

916

emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

917

dep_watermark = Watermark(0, 0)

918

prev_op = None

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

919

# Generate register commands for all operations

920

for op_index, npu_op in enumerate(npu_op_list):

921

dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

922

generate_registers_for_op(emit, npu_op, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

923

if not is_dma_op(npu_op):

924

# Generate BLOCKDEP

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

925

blockdep = calc_blockdep(arch, prev_op, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

926

blockdep = min(blockdep, arch.max_blockdep)

927

emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)

928

prev_op = npu_op

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

929

930

generate_cmd_waits(emit, cmd_waits)

931

# Generate the actual NPU_OP command

932

generate_operation_code(emit, npu_op)

933

if add_to_debug_db is not None:

934

add_to_debug_db(npu_op, emit.offset)

935

# Fill in final part of command stream:

936

emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

937

res = emit.to_list()

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

938

if verbose:

939

emit.print_cmds()

940

print("number of commands", len(emit.cmd_stream))

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame^]

941

print("command stream length in words", len(res))

return res

# -------------------------------------------------------------------

946

# EXTERNAL API

947

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

948

949

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

950

def find_block_configs(npu_op: NpuOperation, npu_accelerator: NpuAccelerator) -> List[NpuShape3D]:

951

"""

952

Internal implementation of the public facing API for finding block configs.

953

"""

954

if is_dma_op(npu_op):

955

return []

956

arch = create_default_arch(Accelerator.from_npu_accelerator(npu_accelerator))

957

shared_buffer = create_shared_buffer(npu_op, arch)

958

blocks = find_suitable_block_configs(arch, shared_buffer)

959

return [NpuShape3D(height=block[0], width=block[1], depth=block[3]) for block in blocks]

960

961

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

962

def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

963

"""

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

964

Internal implementation of the public facing API for generating an Ethos-U register command stream.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

965

Calculates dependencies between commands and inserts wait operations if needed.

966

967

:param npu_op_list: List[NpuOperation] list of high level NPU operations

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

968

:param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator

969

:return Ethos-U instructions, as a list of 32-bit integers

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

970

"""

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

971

accelerator = Accelerator.from_npu_accelerator(npu_accelerator)

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

972

arch = create_default_arch(accelerator)

Louis Verhaard