# Copyright (C) 2020-2021 Arm Limited or its affiliates. All rights reserved.
#
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the License); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an AS IS BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Description:
# Register-level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations, generates
# all the register settings, calculates dependencies between commands and inserts wait operations, and generates
# a bit stream suitable for interpretation by the Ethos-U processor.
import math
from collections import defaultdict
from enum import Enum
from enum import IntEnum
from typing import Dict
from typing import List
from typing import Optional

import numpy as np

from . import scaling
from .api import NpuAccelerator
from .api import NpuActivation
from .api import NpuActivationOp
from .api import NpuAddressRange
from .api import NpuBlockOperation
from .api import NpuBlockTraversal
from .api import NpuConv2DOperation
from .api import NpuConvDepthWiseOperation
from .api import NpuDataType
from .api import NpuDmaOperation
from .api import NpuElementWiseOp
from .api import NpuElementWiseOperation
from .api import NpuFeatureMap
from .api import NpuKernel
from .api import NpuLayout
from .api import NpuOperation
from .api import NpuOperationType
from .api import NpuPadding
from .api import NpuPoolingOp
from .api import NpuPoolingOperation
from .api import NpuQuantization
from .api import NpuResamplingMode
from .api import NpuRoundingMode
from .api import NpuShape3D
from .api import NpuTileBox
from .architecture_allocator import ArchitectureBlockConfig
from .architecture_allocator import try_block_config
from .architecture_features import Accelerator
from .architecture_features import ArchitectureFeatures
from .architecture_features import create_default_arch
from .architecture_features import SHRAMElements
from .errors import VelaError
from .ethos_u55_regs.ethos_u55_regs import acc_format
from .ethos_u55_regs.ethos_u55_regs import activation
from .ethos_u55_regs.ethos_u55_regs import cmd0
from .ethos_u55_regs.ethos_u55_regs import cmd1
from .ethos_u55_regs.ethos_u55_regs import elementwise_mode
from .ethos_u55_regs.ethos_u55_regs import pooling_mode
from .ethos_u55_regs.ethos_u55_regs import resampling_mode
from .ethos_u55_regs.ethos_u55_regs import rounding
from .numeric_util import quantise_float32
from .numeric_util import round_away_zero
from .numeric_util import round_up_to_int
from .operation import NpuBlockType
from .range_set import MemoryAccessSet
from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM
from .register_command_stream_util import calc_blockdep
from .register_command_stream_util import get_dma_memory_accesses
from .register_command_stream_util import get_op_memory_accesses
from .register_command_stream_util import get_strides
from .register_command_stream_util import get_wait_dependency
from .register_command_stream_util import has_ifm2
from .register_command_stream_util import shape3d_to_block
from .register_command_stream_util import to_kernel
from .register_command_stream_util import UNARY_ELEMWISE_OPS
from .register_command_stream_util import Watermark


class RegisterMachine:
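    """Tracks the last value written to each register so that redundant register writes can be
    skipped: set_register() returns True only when the new (command, value) pair differs from
    what the active bank already holds."""
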
    def __init__(self):
        self.n_banks = 1
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        is_changed = self.registers[self.bank_idx][reg] != value
        self.registers[self.bank_idx][reg] = value
        # is_changed = True  # force command
        return is_changed

    def switch_bank(self):
        self.bank_idx = (self.bank_idx + 1) % self.n_banks


class CmdMode(IntEnum):
    NoPayload = 0x0000
    Payload32 = 0x4000
    Mask = 0xC000
    CmdOpMask = 0x03FF


class CommandStreamEmitter:
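    """Emits the command stream as a list of 32-bit words.

    As encoded below, each command carries a 10-bit opcode plus payload-mode bits in the lower
    16 bits and a 16-bit parameter in the upper 16 bits; cmd1 commands append one additional
    32-bit payload word (CmdMode.Payload32).
    """
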
    WORD_SIZE = 4

    def __init__(self):
        self.cmd_stream = []
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        self.offset = 0

    def get_reg_machine(self, cmd):
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        else:
            return self.reg_machine[0]

    def size_in_bytes(self):
        sz = 0
        for cmd in self.cmd_stream:
            sz += len(cmd) * CommandStreamEmitter.WORD_SIZE
        return sz

    def to_list(self) -> List[int]:
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        print("Code: Command: Param: Payload:")
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)

            # code and command
            s = " 0x%04x " % code
            if payload_mode == CmdMode.NoPayload:
                s += str(cmd0(code & CmdMode.CmdOpMask))
            else:
                s += str(cmd1(code & CmdMode.CmdOpMask))

            s = s.ljust(40)
            s += "%5d" % param

            # payload
            if payload_mode == CmdMode.Payload32:
                s += " 0x%08x (%d)" % (words_for_one_command[1], words_for_one_command[1])
            else:
                s += " -"

            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
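        # Addresses may be wider than 32 bits (e.g. on Ethos-U65); the low 32 bits go into the
        # payload word and the remaining high bits are carried in the 16-bit parameter field.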
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()


# -------------------------------------------------------------------
# REGISTER GENERATION
# -------------------------------------------------------------------


# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7


pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps an AccumulatorType enum to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}


def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits"""
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            max = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > max:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {max}")


def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value"""
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)


def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)


def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers"""
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = activation_op_map[act.op_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)


def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers"""
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])


def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers"""
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)


def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)


def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register"""
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)


def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register"""
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)


def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)


def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))


def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers"""
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))


def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))


def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
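    # The stride register packs x/y stride, dilation and block traversal into a single word.
    # Worked example from the packing below: stride 2x2, dilation 1x1, part-kernel-first -> 0b111.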
    # set kernel x stride low bit
    stride = (kernel.stride_x - 1) & 1
    # set kernel y stride low bit
    stride |= ((kernel.stride_y - 1) & 1) << 1
    # set kernel x stride extension bits
    stride |= ((kernel.stride_x - 1) >> 1) << 6
    # set kernel y stride extension bits
    stride |= ((kernel.stride_y - 1) >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)


def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set bias/scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)


def generate_block_config(
    emit: CommandStreamEmitter, block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)


def generate_shram_registers(
    emit: CommandStreamEmitter, npu_op: NpuBlockOperation, arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])


def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """


def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Returns the architecture block config for the given operation"""
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        npu_op.ofm.shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config


def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Generates KERNEL_WAIT/DMA_WAIT"""
    if cmd_waits.npu >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, cmd_waits.npu)

    if cmd_waits.dma >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, cmd_waits.dma)


def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Generate registers that are common to most operations"""
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)


# -------------------------------------------------------------------
# SCALING
# -------------------------------------------------------------------


def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations"""
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and shift in (0, 1):
                # Special handling if input scale is 1/2048 or 1/4096
                scale = 3 << shift
                shift = 0
            else:
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        # for ResizeBilinear operations with rescale
        rescale = pool_op.rescale
        rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
        scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
        scale = int(round_away_zero(scale * rescale))
    else:
        # If avg pool is fused with concat or another memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case.
        # Normally the scale is maximised to get maximum precision, which means that
        # if rescale != 1, the scale needs to account for the number of bits needed for rescaling.
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)


def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.
    Returns the operand to scale
710 """
711 op_to_scale = 0
712 if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
713 input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
714 input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
715 output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None
716
717 if npu_op.activation is not None and npu_op.activation.op_type in (
718 NpuActivationOp.SIGMOID,
719 NpuActivationOp.TANH,
720 ):
721 output_scale = 1 / 0x3000
722
723 if npu_op.sub_op_type == NpuElementWiseOp.MUL:
724 if None in (input_scale, input2_scale, output_scale):
725 ofm_scale = 1
726 shift = 0
727 else:
728 ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
729 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
730 else: # Add/Sub
Henrik G Olssonad656a82021-03-19 15:50:28 +0100731 bitdepth = npu_op.ifm.data_type.size_in_bits()
732 use_advanced_scaling = False
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100733 if None in (input_scale, input2_scale, output_scale):
734 opa_scale = opb_scale = ofm_scale = 1
735 opa_shift = shift = 0
736 if npu_op.rescale is not None:
737 ofm_scale, shift = npu_op.rescale
Henrik G Olssonad656a82021-03-19 15:50:28 +0100738 elif input_scale == input2_scale and bitdepth == 16:
739 opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
740 input_scale, input2_scale, output_scale
741 )
742 # align the double rounding with that of advanced scaling
743 opa_scale /= 2
744 opb_scale /= 2
745 shift -= 1
746 opa_shift = 0 # Unused for this case
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100747 elif input_scale == input2_scale:
748 opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
749 input_scale, input2_scale, output_scale
750 )
751 opa_shift = 0 # Unused for this case
Henrik G Olssonad656a82021-03-19 15:50:28 +0100752 # For 8 bit we can't guarantee double rounding with simplified scaling will always be
753 # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
754 # the following we know that double rounding will have no effect for advanced scaling
755 # no matter the input, so we can safely use simplified scaling with double rounding disabled.
756 use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100757 else:
Henrik G Olssonad656a82021-03-19 15:50:28 +0100758 use_advanced_scaling = True
759 if use_advanced_scaling:
760 # Use advanced implementation only when input/output scales differ,
761 # or when we can't guarantee the absence of rounding errors
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100762 (opa_scale, opa_shift, ofm_scale, shift, op_to_scale,) = scaling.advanced_elementwise_add_sub_scale(
763 input_scale, input2_scale, output_scale, bitdepth
764 )
765 opb_scale = 0 # Unused for this case
766 if npu_op.reversed_operands:
767 # If the operand order is reversed we also have to swap which operand is scaled
768 if op_to_scale == scaling.OperandToScale.OPa:
769 op_to_scale = scaling.OperandToScale.OPb
770 else:
771 op_to_scale = scaling.OperandToScale.OPa
772 emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
773 emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
774 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
775 elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
776 output_scale = npu_op.ofm.quantization.scale_f32
777 ofm_scale, shift = scaling.quantise_scale(output_scale)
778 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
779 else:
780 emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
781 return op_to_scale
782
783
784# -------------------------------------------------------------------
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100785# PRINT
786# -------------------------------------------------------------------
Jacob Bohline99b8932020-07-13 16:01:51 +0200787
788
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100789def print_feature_map(fm: NpuFeatureMap, name: str):
790 if fm is not None:
791 q = (
792 "no quantization"
793 if fm.quantization is None
794 else f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
795 )
796 h, w, c = fm.shape
797 sz = h * w * c * fm.data_type.size_in_bytes()
798 print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")
799 strides = get_strides(fm)
800 stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
801 t = fm.tiles
802 addresses = [hex(addr) for addr in t.addresses]
803 print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
Tim Hall79d07d22020-04-27 18:20:16 +0100804
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100805
Dwight Lidman9b43f842020-12-08 17:56:44 +0100806def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
807 pass_info = f", {cmd}" if cmd else ""
808 if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
809 print(f"{index} {npu_op.op_type.name}{pass_info}")
810 return
811 if isinstance(npu_op, NpuDmaOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100812 print(f"{index} DMA_START src={npu_op.src}, dest={npu_op.dest}{pass_info}")
813 return
814 k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100815 if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100816 print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name}:{pass_info}")
Patrik Gustavssoneca2e952020-05-27 09:15:11 +0200817 else:
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100818 if (
Dwight Lidman9b43f842020-12-08 17:56:44 +0100819 isinstance(npu_op, NpuConv2DOperation)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100820 and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
821 ):
822 fc = "FullyConnected "
823 else:
824 fc = ""
825 print(f"{index} {fc}{npu_op.op_type.name}{pass_info}")
826 print_feature_map(npu_op.ifm, "IFM")
827 if npu_op.ifm2_scalar is not None:
828 quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
829 print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
830 else:
831 print_feature_map(npu_op.ifm2, "IFM2")
832 print_feature_map(npu_op.ofm, "OFM")
833 if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
834 print(f" Kernel: {k}")
835 if npu_op.padding is not None:
836 print(f" {npu_op.padding}")
837 for weights in npu_op.weights:
838 print(f" Weights: {weights}")
839 for bias in npu_op.biases:
840 print(f" Scales: {bias}")
841 if npu_op.activation is not None:
842 act = npu_op.activation
843 if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
844 lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
845 print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
Dwight Lidman9b43f842020-12-08 17:56:44 +0100846 if isinstance(npu_op, NpuConv2DOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100847 print(f" {npu_op.block_traversal}")
848 bh, bw, bc = npu_op.block_config
Dwight Lidman9b43f842020-12-08 17:56:44 +0100849 rescale = (
850 f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
851 )
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100852 print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")
Tim Hall79d07d22020-04-27 18:20:16 +0100853
Tim Hall79d07d22020-04-27 18:20:16 +0100854
Dwight Lidman9b43f842020-12-08 17:56:44 +0100855def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
856 npu_op_to_cmd = dict() if npu_op_to_cmd is None else npu_op_to_cmd
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100857 for index, npu_op in enumerate(npu_op_list):
Dwight Lidman9b43f842020-12-08 17:56:44 +0100858 print_operation(npu_op, index, npu_op_to_cmd.get(npu_op))
Tim Hall79d07d22020-04-27 18:20:16 +0100859
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100860
861# -------------------------------------------------------------------
862# OPERATIONS
863# -------------------------------------------------------------------
864
865
866def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
867 """Generates NPU_OP_* command"""
Dwight Lidman9b43f842020-12-08 17:56:44 +0100868 if isinstance(npu_op, NpuDmaOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100869 emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, npu_op.channel * 16 + npu_op.mode)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100870 elif isinstance(npu_op, NpuConv2DOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100871 emit.cmd_do_operation(cmd0.NPU_OP_CONV)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100872 elif isinstance(npu_op, NpuConvDepthWiseOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100873 emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
Dwight Lidman9b43f842020-12-08 17:56:44 +0100874 elif isinstance(npu_op, NpuPoolingOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100875 emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
Dwight Lidman9b43f842020-12-08 17:56:44 +0100876 elif isinstance(npu_op, NpuElementWiseOperation):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100877 emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
878 else:
879 assert 0, "Unsupported operation"
880
881
Louis Verhaard933f55e2020-11-25 14:10:30 +0100882def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100883 """Generates register commands for Conv2D operations"""
884 generate_common(emit, npu_op, npu_op.block_traversal, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100885
886
Dwight Lidman9b43f842020-12-08 17:56:44 +0100887def generate_conv_depthwise_op(
888 emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
889):
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100890 """Generates register commands for depthwise convolution operations"""
891 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100892
893
894def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
895 """Generates register commands for pooling operations"""
896 use_global_scale = (
897 npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
898 )
899 generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
900 # Pooling op specific
901 if use_global_scale:
902 generate_ofm_scaling_for_pooling(emit, npu_op)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100903
904
905def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
906 """Generates register commands for elementwise operations"""
907 use_global_scale = npu_op.sub_op_type in (
908 NpuElementWiseOp.ADD,
909 NpuElementWiseOp.SUB,
910 NpuElementWiseOp.MUL,
911 NpuElementWiseOp.LRELU,
912 NpuElementWiseOp.ABS,
913 )
914 op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
915 generate_common(
916 emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
917 )
918 # Elementwise op specific
Louis Verhaard1e170182020-11-26 11:42:04 +0100919 if npu_op.sub_op_type not in UNARY_ELEMWISE_OPS:
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100920 # Binary operation; generate IFM2 registers
921 assert npu_op.ifm2 is not None
922 has_scalar = npu_op.ifm2_scalar is not None
923 generate_ifm2(emit, npu_op.ifm2, has_scalar)
924 generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
925 generate_ifm2_broadcast(emit, npu_op)
926 if has_scalar:
927 quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
928 assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
929 emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100930
931
932def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
933 """Generates register commands for DMA operations"""
934 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, dma_op.src.region)
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100935 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, dma_op.src.address)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100936 emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, dma_op.dest.region)
937
Mauricio Bricenoa8e48e62021-03-19 09:13:50 +0100938 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, dma_op.dest.address)
939 emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)
Louis Verhaarde8a5a782020-11-02 18:04:27 +0100940
941
Louis Verhaard933f55e2020-11-25 14:10:30 +0100942def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """Generates register commands for the given operation, but not the final NPU_OP_... command"""
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"


def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
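            # The BLOCKDEP value expresses how independent this operation's blocks are from the
            # previous operation's output (0 = fully dependent); larger values let the hardware
            # overlap the two operations. calc_blockdep derives it from the ops' memory accesses.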
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print("number of commands", len(emit.cmd_stream))
        print("command stream length in words", len(res))
    return res


def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:
    """
    Internal implementation of the public facing API for generating an Ethos-U register command stream.
    Calculates dependencies between commands and inserts wait operations if needed.

    :param npu_op_list: List[NpuOperation] list of high level NPU operations
    :param npu_accelerator: NpuAccelerator enum to pick the correct Ethos-U accelerator
    :return Ethos-U instructions, as a list of 32-bit integers
    """
    accelerator = Accelerator.from_npu_accelerator(npu_accelerator)
    arch = create_default_arch(accelerator)
    mem_limits = dict()
    for region in range(0, 8):
        mem_limits[region] = arch.max_address_offset
    mem_limits[BASE_PTR_INDEX_MEM2MEM] = arch.shram_size_bytes
    return generate_command_stream(npu_op_list, arch, verbose=False, mem_limits=mem_limits)
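

# A minimal usage sketch (illustrative, not executed): callers normally reach this module through
# the public-facing API in .api, but the entry point above can be driven directly. Here
# `build_npu_op_list` is a hypothetical helper that produces a List[NpuOperation]:
#
#     from .api import NpuAccelerator
#
#     npu_ops = build_npu_op_list()
#     cmd_stream_words = generate_register_command_stream(npu_ops, NpuAccelerator.Ethos_U55_128)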