blob: fd32b6552e35f25ce85153565f40f317c827a7cb [file] [log] [blame]
erik.andersson@arm.com460c6892021-02-24 14:38:09 +01001# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
Tim Hall79d07d22020-04-27 18:20:16 +01002#
3# SPDX-License-Identifier: Apache-2.0
4#
5# Licensed under the Apache License, Version 2.0 (the License); you may
6# not use this file except in compliance with the License.
7# You may obtain a copy of the License at
8#
9# www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an AS IS BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14# See the License for the specific language governing permissions and
15# limitations under the License.
Tim Hall79d07d22020-04-27 18:20:16 +010016# Description:
Tim Hallc8a73862020-10-27 12:43:14 +000017# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates
Tim Hall79d07d22020-04-27 18:20:16 +010018# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit
Tim Hallc8a73862020-10-27 12:43:14 +000019# stream suitable for interpretation by the Ethos-U processor.
Louis Verhaardc6291292021-03-19 09:35:48 +010020import math
Tim Hall79d07d22020-04-27 18:20:16 +010021from collections import defaultdict
Diego Russoe8a10452020-04-21 17:39:10 +010022from enum import Enum
23from enum import IntEnum
Dwight Lidman9b43f842020-12-08 17:56:44 +010024from typing import Dict
Louis Verhaarde8a5a782020-11-02 18:04:27 +010025from typing import List
26from typing import Optional
Diego Russoea6111a2020-04-14 18:41:58 +010027
28import numpy as np
29
30from . import scaling
Louis Verhaardaeae5672020-11-02 18:04:27 +010031from .api import NpuAccelerator
Louis Verhaarde8a5a782020-11-02 18:04:27 +010032from .api import NpuActivation
33from .api import NpuActivationOp
34from .api import NpuAddressRange
35from .api import NpuBlockOperation
36from .api import NpuBlockTraversal
37from .api import NpuConv2DOperation
Dwight Lidman9b43f842020-12-08 17:56:44 +010038from .api import NpuConvDepthWiseOperation
Louis Verhaarde8a5a782020-11-02 18:04:27 +010039from .api import NpuDataType
40from .api import NpuDmaOperation
41from .api import NpuElementWiseOp
42from .api import NpuElementWiseOperation
43from .api import NpuFeatureMap
44from .api import NpuKernel
45from .api import NpuLayout
46from .api import NpuOperation
47from .api import NpuOperationType
48from .api import NpuPadding
49from .api import NpuPoolingOp
50from .api import NpuPoolingOperation
51from .api import NpuQuantization
52from .api import NpuResamplingMode
53from .api import NpuRoundingMode
54from .api import NpuShape3D
55from .api import NpuTileBox
Tim Halld8339a72021-05-27 18:49:40 +010056from .architecture_allocator import ArchitectureBlockConfig
57from .architecture_allocator import try_block_config
Louis Verhaarde8a5a782020-11-02 18:04:27 +010058from .architecture_features import Accelerator
Diego Russoe8a10452020-04-21 17:39:10 +010059from .architecture_features import ArchitectureFeatures
Louis Verhaard52078302020-11-18 13:35:06 +010060from .architecture_features import create_default_arch
Diego Russoe8a10452020-04-21 17:39:10 +010061from .architecture_features import SHRAMElements
erik.andersson@arm.com1878dab2021-03-16 09:40:24 +010062from .errors import VelaError
Diego Russoe8a10452020-04-21 17:39:10 +010063from .ethos_u55_regs.ethos_u55_regs import acc_format
64from .ethos_u55_regs.ethos_u55_regs import activation
65from .ethos_u55_regs.ethos_u55_regs import cmd0
66from .ethos_u55_regs.ethos_u55_regs import cmd1
67from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
Fredrik Svedberga0c36242020-06-03 15:43:31 +020068from .ethos_u55_regs.ethos_u55_regs import pooling_mode
Jacob Bohlincf7da102020-05-20 09:03:40 +020069from .ethos_u55_regs.ethos_u55_regs import resampling_mode
Diego Russoe8a10452020-04-21 17:39:10 +010070from .ethos_u55_regs.ethos_u55_regs import rounding
Diego Russoe8a10452020-04-21 17:39:10 +010071from .numeric_util import quantise_float32
72from .numeric_util import round_away_zero
Diego Russoe8a10452020-04-21 17:39:10 +010073from .numeric_util import round_up_to_int
Patrik Gustavssonc74682c2021-08-17 14:26:38 +020074from .operation import ExplicitScaling
Tim Hall79d07d22020-04-27 18:20:16 +010075from .operation import NpuBlockType
Dwight Lidman9b43f842020-12-08 17:56:44 +010076from .range_set import MemoryAccessSet
Louis Verhaard024c3552021-03-17 14:26:34 +010077from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
Louis Verhaard1e170182020-11-26 11:42:04 +010078from .register_command_stream_util import calc_blockdep
79from .register_command_stream_util import get_dma_memory_accesses
80from .register_command_stream_util import get_op_memory_accesses
81from .register_command_stream_util import get_strides
82from .register_command_stream_util import get_wait_dependency
83from .register_command_stream_util import has_ifm2
Tim Halld8339a72021-05-27 18:49:40 +010084from .register_command_stream_util import shape3d_to_block
Louis Verhaard1e170182020-11-26 11:42:04 +010085from .register_command_stream_util import to_kernel
86from .register_command_stream_util import UNARY_ELEMWISE_OPS
87from .register_command_stream_util import Watermark
Tim Hall79d07d22020-04-27 18:20:16 +010088
89
class RegisterMachine:
    """Tracks the last value written to each register so that redundant
    register writes can be elided from the command stream."""

    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Record a write of `value` to `reg`; return True when the value
        differs from the last one seen by the current bank (i.e. the write
        must actually be emitted)."""
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # Debugging tip: return True unconditionally here to force every command
        return is_changed

    def switch_bank(self):
        """Advance to the next register bank (a no-op while n_banks == 1)."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks
104
105
class CmdMode(IntEnum):
    # Encoding of the payload-mode / opcode bits within a command stream word
    NoPayload = 0x0000  # cmd0: single word, parameter carried in the upper 16 bits
    Payload32 = 0x4000  # cmd1: command word followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of the command word
    CmdOpMask = 0x03FF  # selects the command opcode bits of the command word
111
112
class CommandStreamEmitter:
    """Builds up an Ethos-U command stream as a list of 32-bit words.

    Redundant register writes are skipped with the help of two RegisterMachine
    banks: one for DMA related commands and one for everything else.
    """

    WORD_SIZE = 4  # bytes per command stream word

    def __init__(self):
        self.cmd_stream = []  # one tuple of words per emitted command
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0  # byte offset where the next command will land

    def get_reg_machine(self, cmd):
        # DMA registers are tracked separately from all other registers
        return self.reg_machine[1] if "DMA" in cmd.name else self.reg_machine[0]

    def size_in_bytes(self):
        """Total size of the emitted stream, in bytes."""
        return sum(len(words) for words in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Flatten the stream into a single list of 32-bit words."""
        return [word for words in self.cmd_stream for word in words]

    def print_cmds(self):
        """Pretty-print the emitted commands for debugging."""
        print("Code: Command: Param: Payload:")
        for words in self.cmd_stream:
            code = words[0] & 0x0000FFFF  # lower 16 bits
            param = words[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            line = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                line += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                line += str(cmd1(code & CmdMode.CmdOpMask))

            line = line.ljust(40)
            line += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                line += " 0x%08x (%d)" % (words[1], words[1])
            else:
                line += " -"

            print(line)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emit a cmd0 command with a 16-bit parameter, unless redundant."""
        param = int(param.value) if isinstance(param, Enum) else int(param)
        param &= 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # Not a redundant register write; append it to the stream
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emit a cmd1 command with a 32-bit payload, unless redundant."""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # Not a redundant register write; append command word + payload word
            self.cmd_stream.append((command, offset))
            self.offset += 2 * CommandStreamEmitter.WORD_SIZE

    def cmd1_with_address(self, cmd: cmd1, offset):
        # Low 32 address bits go in the payload, bits above 32 in the parameter
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emit a wait command; wait commands are never elided."""
        param = 16 * channel + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emit an operation command and switch register bank afterwards."""
        command = ((int(param) & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()
205
206
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100207# -------------------------------------------------------------------
208# REGISTER GENERATION
209# -------------------------------------------------------------------
210
211
# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    # Bit flags for the IFM2_BROADCAST register
    BroadcastHdim = 1 << 0  # broadcast IFM2 along the height dimension
    BroadcastWdim = 1 << 1  # broadcast IFM2 along the width dimension
    BroadcastCdim = 1 << 2  # broadcast IFM2 along the depth (channel) dimension
    ReverseOperandOrder = 1 << 6  # set when the operands are reversed
    UseIFM2Scalar = 1 << 7  # IFM2 is a scalar constant, not a feature map
219
220
# Maps NpuPoolingOp to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps NpuElementWiseOp to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps NpuActivationOp to the corresponding activation register enum
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps NpuResamplingMode to the corresponding resampling_mode register enum
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}
268
269
def check_mem_limits(memory_accesses: "MemoryAccessSet", mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits.

    :param memory_accesses: the operation's accesses; each access maps region index -> range set
    :param mem_limits: maps region index to the highest allowed address offset in that region
    :raises VelaError: if an access references an unknown region, or any range
        endpoint is negative or exceeds the region's limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Renamed from `max` to avoid shadowing the builtin
            max_offset = mem_limits[region]
            for start, end in range_set.ranges:
                # Both endpoints of every range must lie inside [0, max_offset]
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max_offset:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max_offset}")
283
284
def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    # Missing quantization info falls back to scale 1 / zero point 0
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)
290
291
def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    # One register per padding edge, emitted in top/left/bottom/right order
    for pad_cmd, pad_value in (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    ):
        emit.cmd0_with_param(pad_cmd, pad_value)
298
299
def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    # No fused activation is encoded as NONE_OR_RELU with an unclamped range
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # Quantize the requested clamp range; fall back to the OFM data type's full range
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    # Clamp to what both a 16-bit signed register value and the OFM type can represent
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        # LUT activations select one of 8 lookup tables, encoded as 16 + table index
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)
326
327
def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # NHCWB16 (brick format) base pointers must be 16-byte aligned
        for addr in addresses:
            assert int(addr) % 16 == 0
    # One base register per tile, in tile order
    for base_cmd, addr in zip(ptr_cmds, addresses):
        emit.cmd1_with_address(base_cmd, addr)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100335
336
def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    # Registers hold extent minus one, in (height_0, height_1, width_0) order
    for tile_cmd, extent in zip(tile_cmds, (tiles.height_0, tiles.height_1, tiles.width_0)):
        emit.cmd0_with_param(tile_cmd, extent - 1)
342
343
def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    for stride_cmd, stride in (
        (stride_c_cmd, strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, strides.height),  # stride between vertical values (H)
        (stride_x_cmd, strides.width),  # stride between horisontal values (W)
    ):
        emit.cmd1_with_address(stride_cmd, stride)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100352
353
def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    # Bit 0: signedness; bits 2..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 2)

    # Bit 6: set for brick (NHCWB16) layout
    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    # Operand scale selector, placed at bit 8 and up
    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)
366
367
def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    # Bit 0: signedness; bits 1..: activation precision derived from element size
    prec = (1 if dtype.is_signed() else 0) | (precision_map[dtype.size_in_bits()] << 1)

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    # Bit 6: set for brick (NHCWB16) layout
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    # Rounding mode occupies the top bits (14 and up)
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)
382
383
def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        # For each mismatching dimension, IFM2 must be 1 and is broadcast over IFM
        for ifm_dim, ifm2_dim, broadcast_bit in (
            (ifm.shape.height, ifm2.shape.height, IFM2Broadcast.BroadcastHdim),
            (ifm.shape.width, ifm2.shape.width, IFM2Broadcast.BroadcastWdim),
            (ifm.shape.depth, ifm2.shape.depth, IFM2Broadcast.BroadcastCdim),
        ):
            if ifm_dim != ifm2_dim:
                assert ifm2_dim == 1
                ifm2_broadcast |= broadcast_bit

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)
411
412
def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    # Base addresses of the (up to 4) IFM tiles
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    # Tile extents: heights of tiles 0/1 and width of tile 0
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))
428
429
def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        # Region, tile addresses and tile extents only apply when IFM2 is a
        # real feature map (not a scalar constant)
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    # Strides and zero point are emitted unconditionally
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))
445
446
def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    # Base addresses of the (up to 4) OFM tiles
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    # Tile extents: heights of tiles 0/1 and width of tile 0
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    # Full OFM shape (registers hold extent minus one)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))
464
465
def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    # Registers hold the dilated kernel extent minus one
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # Pack strides, dilations and traversal order into the KERNEL_STRIDE register
    stride_reg = stride_x_m1 & 1  # bit 0: kernel x stride low bit
    stride_reg |= (stride_y_m1 & 1) << 1  # bit 1: kernel y stride low bit
    stride_reg |= (stride_x_m1 >> 1) << 6  # bits 6..: kernel x stride extension
    stride_reg |= (stride_y_m1 >> 1) << 9  # bits 9..: kernel y stride extension
    stride_reg |= (kernel.dilation_x - 1) << 3
    stride_reg |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride_reg |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride_reg)
483
484
def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    # Region is taken from the first range; presumably all weight ranges share it
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            # Core has its own weight range
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            # Core is present but has no weights: point it at the first range with zero length
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)
503
504
def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    # Region is taken from the first range; presumably all bias ranges share it
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            # Core has its own bias/scale range
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            # Core is present but has no biases: point it at the first range with zero length
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)
520
521
def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    # Registers hold block extent minus one, in height/width/depth order
    for blk_cmd, blk_extent in (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    ):
        emit.cmd0_with_param(blk_cmd, blk_extent - 1)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100529
530
def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    # SHRAM buffer offsets come from the precomputed architecture block config layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        # Second input buffer start is only set when the operation has an IFM2
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100540
541
def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): the body is empty, so this function always returns None
    # regardless of what the docstring promises; it looks like a leftover stub —
    # confirm whether any callers remain before implementing or removing it.
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100549
550
def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    # Derive the NPU block type from the concrete operation class
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    # Table lookup activations reserve 2 SHRAM banks for the LUT
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    # Scaled mode requires every feature map involved to carry a float scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    # Validate the chosen block config against the architecture constraints
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        npu_op.ofm.shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config
Louis Verhaard933f55e2020-11-25 14:10:30 +0100600
601
def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    # A negative outstanding count means no wait is needed on that channel
    for wait_cmd, outstanding in (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    ):
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)
609
610
def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    # Input feature map: region/addresses/tiles/strides, precision and upscaling
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    # Output feature map registers
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations carry no kernel; every other block operation does
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # Block configuration registers plus the SHRAM layout derived from it
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100637
638
639# -------------------------------------------------------------------
640# SCALING
641# -------------------------------------------------------------------
642
643
def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    Four cases are handled, in priority order:
    1. fused sigmoid/tanh activation - output scale is fixed to 1/0x3000,
       with special handling for INT16 power-of-two input scales
    2. fused quantize op - scale is the ratio of input and output scale
    3. explicit/bilinear rescale carried in pool_op.rescale
    4. plain average pooling - scale maximised for precision, taking the
       rescale between input and output quantization into account
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/2048 (tanh/sigmoid) or 1/4096 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Maximise precision: double the scale until it no longer fits
                # in half the int16 range (or the shift limit is reached)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # 8-bit input: quantise the pooling scale with enough headroom bits
            # to represent the rescale factor
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear operations with rescale
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization info available; use unit scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)
715
716
def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operator to scale
    """
    # 0 means "no operand is scaled"; only the advanced add/sub path below
    # selects a specific operand (scaling.OperandToScale.OPa/OPb).
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # Missing quantization on any feature map is represented as None and
        # falls through to unit scaling below.
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # Fused sigmoid/tanh expects a fixed output scale of 1/0x3000
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit (scale, shift) pair supplied by the caller takes precedence
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # Incomplete quantization info: fall back to unit scaling
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:  # Add/Sub
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if None in (input_scale, input2_scale, output_scale):
                # Incomplete quantization info: unit scaling, unless an explicit
                # rescale pair was supplied
                opa_scale = opb_scale = ofm_scale = 1
                opa_shift = shift = 0
                if npu_op.rescale is not None:
                    ofm_scale, shift = npu_op.rescale
            elif input_scale == input2_scale and bitdepth == 16:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale /= 2
                opb_scale /= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # LeakyReLU/Abs: only the output scale matters
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        # All other elementwise operations use unit scaling
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale
795
796
797# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100798# PRINT
799# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200800
801
def print_feature_map(fm: NpuFeatureMap, name: str):
    """Prints a one-line summary of the given feature map, plus its strides and tile layout."""
    if fm is None:
        return
    if fm.quantization is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    h, w, c = fm.shape
    # Total size in bytes of the (non-tiled) feature map
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f"    {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = [hex(addr) for addr in t.addresses]
    print(f"    {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100817
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100818
def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a multi-line, human readable description of a single NPU operation
    (used by the verbose command stream output)."""
    pass_info = f", {cmd}" if cmd else ""
    # Operations that are neither DMA nor block operations only get a one-line summary
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name}{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
    else:
        # A 1x1 stride-1 undilated convolution is reported as fully connected
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # Scalar second input: print the raw and quantized value instead of a feature map
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f"    IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f"    Kernel: {k}")
    if npu_op.padding is not None:
        print(f"    {npu_op.padding}")
    for weights in npu_op.weights:
        print(f"    Weights: {weights}")
    for bias in npu_op.biases:
        print(f"    Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # Plain unclamped RELU/no-activation is not worth a line of output
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f"    Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f"    {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Only pooling/elementwise ops carry a rescale attribute
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f"    Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100866
Tim Hall79d07d22020-04-27 18:20:16 +0100867
def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints a human readable description of every operation in the list."""
    lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for op_index, operation in enumerate(npu_op_list):
        print_operation(operation, op_index, lookup.get(operation))
Tim Hall79d07d22020-04-27 18:20:16 +0100872
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873
874# -------------------------------------------------------------------
875# OPERATIONS
876# -------------------------------------------------------------------
877
878
def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Generates NPU_OP_* command"""
    # Dispatch on the concrete operation type; pooling and elementwise additionally
    # carry their sub-operation as the command parameter
    if isinstance(npu_op, NpuDmaOperation):
        # Parameter encodes DMA channel (upper bits) and mode (lower bits)
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
    elif isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
    elif isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
    elif isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
    else:
        assert 0, "Unsupported operation"
893
894
def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits all register settings for a 2D convolution operation."""
    # Conv2D is the only operation type that carries its own block traversal order
    generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100898
899
def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits all register settings for a depthwise convolution operation."""
    # Depthwise convolutions always traverse depth first
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100905
906
def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    Global OFM scaling is used for average/reduce-sum pooling without padding,
    or when an explicit, non-per-channel scaling is supplied via rescale.
    """
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    if npu_op.rescale is not None and isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100919
920
def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Generates register commands for elementwise operations"""
    # These sub-operations have their OFM scale programmed globally
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific: unary operations have no second input
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return
    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100946
947
def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Generates register commands for DMA operations"""
    # Source: region index plus address within the region
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
    # Destination: region index plus address, then the transfer length in bytes
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100956
957
def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.
    """
    # Dispatch to the operation-type specific generator
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100975
976
def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: high level NPU operations, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, prints the operations and the generated commands
    :param mem_limits: maximum allowed address offset per region, for error checking
    :param add_to_debug_db: optional callback(npu_op, offset) invoked per operation
    :param npu_op_to_cmd: optional mapping from NPU operation to high-level command,
        only used for verbose printing
    :raises VelaError: if a memory limit is exceeded or the stream grows past 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        # Multi-core systems need the parallel mode register set up front
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            # Thread the dependency watermark through the whole stream so each
            # operation only waits on work it actually depends on
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # The hardware's command stream pointer is 24 bits wide
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res
1042
1043
def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: api.NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    # Default address-offset limit for each of the 8 feature map regions;
    # the region used for memory-to-memory DMA is limited by the SHRAM size
    mem_limits = {region: arch.max_address_offset for region in range(0, 8)}
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)
1059 return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)