Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Rickard Bolin

bc6ee58

2022-11-04 08:24:29 +0000

[diff] [blame^]

16

#

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

17

# Description:

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

18

# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

19

# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

20

# stream suitable for interpretation by the Ethos-U processor.

Louis Verhaard

c629129

2021-03-19 09:35:48 +0100

[diff] [blame]

21

import math

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

22

from collections import defaultdict

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

23

from enum import Enum

24

from enum import IntEnum

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

25

from typing import cast

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

26

from typing import Dict

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

27

from typing import List

28

from typing import Optional

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

from . import scaling

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

33

from .api import NpuAccelerator

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

34

from .api import NpuActivation

35

from .api import NpuActivationOp

36

from .api import NpuAddressRange

37

from .api import NpuBlockOperation

38

from .api import NpuBlockTraversal

39

from .api import NpuConv2DOperation

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

40

from .api import NpuConvDepthWiseOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

41

from .api import NpuDataType

42

from .api import NpuDmaOperation

43

from .api import NpuElementWiseOp

44

from .api import NpuElementWiseOperation

45

from .api import NpuFeatureMap

46

from .api import NpuKernel

47

from .api import NpuLayout

48

from .api import NpuOperation

49

from .api import NpuOperationType

50

from .api import NpuPadding

51

from .api import NpuPoolingOp

52

from .api import NpuPoolingOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

53

from .api import NpuResamplingMode

54

from .api import NpuRoundingMode

55

from .api import NpuShape3D

56

from .api import NpuTileBox

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

57

from .architecture_allocator import ArchitectureBlockConfig

58

from .architecture_allocator import try_block_config

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

59

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

60

from .architecture_features import ArchitectureFeatures

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

61

from .architecture_features import create_default_arch

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

62

from .architecture_features import SHRAMElements

erik.andersson@arm.com

1878dab

2021-03-16 09:40:24 +0100

[diff] [blame]

63

from .errors import VelaError

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

64

from .ethos_u55_regs.ethos_u55_regs import acc_format

65

from .ethos_u55_regs.ethos_u55_regs import activation

66

from .ethos_u55_regs.ethos_u55_regs import cmd0

67

from .ethos_u55_regs.ethos_u55_regs import cmd1

68

from .ethos_u55_regs.ethos_u55_regs import elementwise_mode

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

69

from .ethos_u55_regs.ethos_u55_regs import pooling_mode

Jacob Bohlin

cf7da10

2020-05-20 09:03:40 +0200

[diff] [blame]

70

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

71

from .ethos_u55_regs.ethos_u55_regs import rounding

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

72

from .numeric_util import round_away_zero

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

73

from .numeric_util import round_up_to_int

Patrik Gustavsson

2021-08-17 14:26:38 +0200

[diff] [blame]

74

from .operation import ExplicitScaling

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

75

from .operation import NpuBlockType

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

76

from .range_set import MemoryAccessSet

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

77

from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

78

from .register_command_stream_util import calc_blockdep

79

from .register_command_stream_util import get_dma_memory_accesses

80

from .register_command_stream_util import get_op_memory_accesses

81

from .register_command_stream_util import get_strides

82

from .register_command_stream_util import get_wait_dependency

Fredrik Svedberg

2022-11-04 09:48:49 +0100

[diff] [blame]

83

from .register_command_stream_util import get_zero_point

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

84

from .register_command_stream_util import has_ifm2

Fredrik Svedberg

2022-11-04 09:48:49 +0100

[diff] [blame]

85

from .register_command_stream_util import quantise

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

86

from .register_command_stream_util import shape3d_to_block

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

87

from .register_command_stream_util import to_kernel

88

from .register_command_stream_util import UNARY_ELEMWISE_OPS

89

from .register_command_stream_util import Watermark

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

90

91

92

class RegisterMachine:
    """Remembers the last value written to each register, per bank, so that repeated writes can be detected"""

    def __init__(self):
        self.n_banks = 1
        # One register -> value mapping per bank; registers that were never written read back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank; returns True if it differs from the stored value"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        # Returning True unconditionally here would force every command to be treated as changed
        return previous != value

    def switch_bank(self):
        """Advances to the next register bank, wrapping around after the last one"""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

106

107

108

class CmdMode(IntEnum):
    """Payload mode values and masks applied to the lower 16 bits (the code) of a command word"""

    NoPayload = 0  # 0x0000
    Payload32 = 1 << 14  # 0x4000
    Mask = 3 << 14  # 0xC000, selects the payload mode bits
    CmdOpMask = (1 << 10) - 1  # 0x03FF, selects the command opcode bits

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

115

class CommandStreamEmitter:
    """Collects the words of a register command stream and drops register writes that do not change a value

    Every entry of cmd_stream is a tuple of words: a single word for commands without payload and
    (command, payload) for commands with a 32-bit payload.
    """

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []
        # Index 0 is used for non-DMA commands, index 1 for DMA commands (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Number of bytes emitted so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine for cmd; commands with "DMA" in their name use a separate machine"""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the size of the emitted command stream in bytes"""
        return sum(len(cmd) for cmd in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a single list of words"""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a header followed by one line per command: offset, payload, param, code, command and param"""
        header = (
            f" {'Offset':6}:"
            f" {'Payload':8}"
            f"{'Param':4}"  # no leading space for alignment
            f" {'Code':4}"
            f" - {'Command':30}"
            f" {'Param':5}"
        )
        print(header)

        offset = 0
        for words in self.cmd_stream:
            code = words[0] & 0x0000FFFF  # lower 16 bits
            param = words[0] >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)

            if payload_mode == CmdMode.NoPayload:
                payload = f" {'':8}"
                command = f" - {cmd0(code & CmdMode.CmdOpMask):30}"
            else:
                assert payload_mode == CmdMode.Payload32
                payload = f" {words[1]:08x}"
                command = f" - {cmd1(code & CmdMode.CmdOpMask):30}"

            print(f"{offset:#08x}:{payload} {param:04x} {code:04x}{command} {param:5}")
            # One word without payload, two words with a 32-bit payload
            offset += len(words) * CommandStreamEmitter.WORD_SIZE

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a one-word register write with a 16-bit parameter, unless the register already holds it

        param may be an Enum member (its value is used) or anything accepted by int().
        """
        raw_param = param.value if isinstance(param, Enum) else param
        param = int(raw_param) & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word register write (command word + 32-bit payload), unless it is redundant

        offset is truncated to 32 bits and param to 16 bits.
        """
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two-word register write where bits 32 and up of offset are carried in the param field"""
        # Convert once up front so that the shift operates on a Python int, in the same way that
        # cmd1_with_offset converts its arguments before masking them
        address = int(offset)
        self.cmd1_with_offset(cmd, address, address >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command; the parameter is 16 * channel + outstanding_count"""
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command and switches the bank of the register machine that handles cmd"""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()

218

219

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

220

# -------------------------------------------------------------------

221

# REGISTER GENERATION

222

# -------------------------------------------------------------------

223

224

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

225

# TODO: Replace with definitions from ethos_u55_regs

226

class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST parameter"""

    BroadcastHdim = 0x01  # bit 0
    BroadcastWdim = 0x02  # bit 1
    BroadcastCdim = 0x04  # bit 2
    ReverseOperandOrder = 0x40  # bit 6
    UseIFM2Scalar = 0x80  # bit 7

# Maps an API pooling op to the corresponding pooling_mode value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise op to the corresponding elementwise_mode value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation op to the corresponding activation enum member (not its value).
# NpuActivationOp.TABLE_LOOKUP has no entry in this table.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode enum member (not its value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {
    8: 0,
    16: 1,
    32: 2,
}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

283

def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    mem_limits maps a region to the largest offset that is allowed in that region.

    Raises VelaError if an access uses a region that is not in mem_limits, or if the start or end
    of any accessed range is negative or larger than the limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named "limit" rather than "max" so that the builtin max() is not shadowed
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {limit}."
                            f" Perhaps try running with the HillClimb tensor allocator and/or increasing the"
                            f" maximum iteration of that allocator"
                        )

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

300

301

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

302

def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers"""
    pad_registers = (
        (cmd0.NPU_SET_IFM_PAD_TOP, padding.top),
        (cmd0.NPU_SET_IFM_PAD_LEFT, padding.left),
        (cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom),
        (cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right),
    )
    for pad_cmd, amount in pad_registers:
        emit.cmd0_with_param(pad_cmd, amount)

308

309

310

def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers

    When no activation is given, NONE_OR_RELU is used. A missing min/max defaults to the limit of
    the OFM data type; given values are quantised with the OFM quantization.
    """
    act = NpuActivation(NpuActivationOp.NONE_OR_RELU) if activation is None else activation

    quantized_min = ofm.data_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm.data_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    # Clamp the limits to both the int16 range and the range of the OFM data type
    int16_info = np.iinfo(np.int16)
    quantized_min = max(quantized_min, int16_info.min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, int16_info.max, ofm.data_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)

336

337

338

def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers

    Writes the first four entries of addresses to the first four commands in ptr_cmds.
    """
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(base_addr) % 16) == 0 for base_addr in addresses)

    for tile_idx in range(4):
        base_cmd = ptr_cmds[tile_idx]
        base_addr = addresses[tile_idx]
        emit.cmd1_with_address(base_cmd, base_addr)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

345

346

347

def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers

    tile_cmds must hold the HEIGHT0, HEIGHT1 and WIDTH0 commands, in that order; each register
    is written with the corresponding tile size minus one.
    """
    tile_sizes = (tiles.height_0, tiles.height_1, tiles.width_0)
    for idx, size in enumerate(tile_sizes):
        emit.cmd0_with_param(tile_cmds[idx], size - 1)

352

353

354

def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers"""
    fm_strides = get_strides(fm)
    stride_registers = (
        (stride_c_cmd, fm_strides.depth),  # stride between 16-byte channel blocks (C)
        (stride_y_cmd, fm_strides.height),  # stride between vertical values (H)
        (stride_x_cmd, fm_strides.width),  # stride between horizontal values (W)
    )
    for stride_cmd, stride in stride_registers:
        emit.cmd1_with_address(stride_cmd, stride)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

362

363

364

def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register

    The parameter is assembled from: signedness (bit 0), activation precision (from bit 2),
    NHCWB16 layout flag (bit 6) and op_to_scale (from bit 8).
    """
    dtype = fm.data_type
    sign_bit = 1 if dtype.is_signed() else 0
    precision_field = precision_map[dtype.size_in_bits()] << 2
    layout_bit = (1 << 6) if fm.layout == NpuLayout.NHCWB16 else 0
    scale_field = op_to_scale << 8

    prec = sign_bit | precision_field | layout_bit | scale_field
    emit.cmd0_with_param(precision_cmd, prec)

376

377

378

def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register

    The parameter is assembled from: signedness (bit 0), activation precision (from bit 1),
    NHCWB16 layout flag (bit 6), global scale flag (bit 8) and rounding mode (from bit 14).
    """
    dtype = npu_op.ofm.data_type
    sign_bit = 1 if dtype.is_signed() else 0
    precision_field = precision_map[dtype.size_in_bits()] << 1
    # Set global scale bit, as opposed to using per channel scale
    global_scale_bit = (1 << 8) if use_global_scale else 0
    layout_bit = (1 << 6) if npu_op.ofm.layout == NpuLayout.NHCWB16 else 0
    rounding_field = rounding_mode_map[npu_op.rounding_mode] << 14

    prec = sign_bit | precision_field | layout_bit | global_scale_bit | rounding_field
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

392

393

394

def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations"""
    flags = 0
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder

    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        ifm_shape = npu_op.ifm.shape
        ifm2_shape = npu_op.ifm2.shape
        # (IFM size, IFM2 size, flag to set when IFM2 is broadcast in that dimension)
        dimensions = (
            (ifm_shape.height, ifm2_shape.height, IFM2Broadcast.BroadcastHdim),
            (ifm_shape.width, ifm2_shape.width, IFM2Broadcast.BroadcastWdim),
            (ifm_shape.depth, ifm2_shape.depth, IFM2Broadcast.BroadcastCdim),
        )
        for ifm_size, ifm2_size, broadcast_flag in dimensions:
            if ifm_size != ifm2_size:
                # A mismatching dimension is only allowed when IFM2 has size 1 in it
                assert ifm2_size == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)

421

422

423

def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers"""
    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)
    generate_tiles(emit, tile_cmds, ifm.tiles)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

438

439

440

def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers

    Region, base addresses, tiles and strides are only generated when IFM2 is not a scalar.
    """
    base_cmds = [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3]
    tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]

    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)
        generate_tiles(emit, tile_cmds, ifm2.tiles)
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    # NOTE(review): the zero point is assumed to be written for the scalar case as well - confirm
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

455

456

457

def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers"""
    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]

    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The OFM dimension registers hold the size minus one
    ofm_shape = ofm.shape
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm_shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm_shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm_shape.depth - 1)

    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

474

475

476

def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1

    # bit 0: kernel x stride low bit
    stride = stride_x_m1 & 1
    # bit 1: kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # bit 2: set for part-kernel-first block traversal
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    # from bit 3 / bit 4: x / y dilation minus one
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    # from bit 6: kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # from bit 9: kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9

    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

493

494

495

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers

    Nothing is generated when weights is empty. The region is taken from the first address range.
    """
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # (base command, length command) per core
    core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    # Set weights sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < len(weights):
            core_weights = weights[core]
            emit.cmd1_with_address(base_cmd, core_weights.address)
            emit.cmd1_with_offset(length_cmd, core_weights.length)
        elif core < arch.ncores:
            # No address range for this core: use the first range's address with length 0
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)

513

514

515

def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers

    Nothing is generated when biases is empty. The region is taken from the first address range.
    """
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # (base command, length command) per core
    core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    # Set scale sources for active and present cores
    for core, (base_cmd, length_cmd) in enumerate(core_cmds):
        if core < len(biases):
            core_biases = biases[core]
            emit.cmd1_with_address(base_cmd, core_biases.address)
            emit.cmd1_with_offset(length_cmd, core_biases.length)
        elif core < arch.ncores:
            # No address range for this core: use the first range's address with length 0
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)

530

531

532

def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers"""
    block_registers = (
        (cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height),
        (cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width),
        (cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth),
    )
    # Each register holds the block size minus one
    for blk_cmd, size in block_registers:
        emit.cmd0_with_param(blk_cmd, size - 1)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

540

541

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

542

def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    # IFM2_IB_START is only written for operations that have a second input feature map
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    acc_format_value = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_value)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

553

554

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

555

def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): this function has no implementation beyond its docstring, so it always
    # returns None regardless of its arguments.
    return None

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

562

563

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

564

def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation

    npu_op.block_config must be set and must fit, otherwise an assertion fails.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the kind of operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # Two LUT banks are requested when the activation is a table lookup
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has a quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = all(fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in fms)

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    ofm_shape = shape3d_to_block(npu_op.ofm.shape)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        ofm_shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

613

614

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

615

def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emit the KERNEL_WAIT and/or DMA_WAIT commands requested by cmd_waits.

    A wait command is only emitted when the corresponding watermark value
    (cmd_waits.npu for KERNEL_WAIT, cmd_waits.dma for DMA_WAIT) is non-negative.
    The kernel wait, if any, is always emitted before the DMA wait.
    """
    requested_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, wait_value in requested_waits:
        # A negative value means that no wait of this kind is needed
        if wait_value >= 0:
            emit.cmd_wait(wait_cmd, 0, wait_value)

622

623

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

624

def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """
    Emits the register settings that most block operations have in common.

    The registers are generated in this order: IFM, IFM precision, IFM upscale, padding (only if set), OFM,
    OFM precision, kernel (not for elementwise operations), weights, biases, activation, block config and
    SHRAM registers.

    :param emit: emitter that receives the generated commands
    :param npu_op: the operation to generate registers for; both its ifm and ofm must be set
    :param block_traversal: forwarded to the kernel register generation and to the block config lookup
    :param arch: architecture features, forwarded to the weight/bias generation and the block config lookup
    :param use_global_scale: forwarded to generate_ofm_precision
    :param op_to_scale: forwarded to generate_ifm_precision
    """
    ifm, ofm = npu_op.ifm, npu_op.ofm
    assert ifm is not None and ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are generated for every operation type except elementwise
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # The architecture block config is resolved before the block config register is emitted
    # and is then used for the SHRAM registers
    shram_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_block_config)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

650

651

652

# -------------------------------------------------------------------

653

# SCALING

654

# -------------------------------------------------------------------

655

656

657

def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Computes a scale/shift pair for a pooling operation and emits it in the OFM_SCALE register.

    The pair is selected from, in priority order:
      1. a fused SIGMOID/TANH activation,
      2. a fused quantize (pool_op.fused_quantize),
      3. an explicit pool_op.rescale (either an ExplicitScaling or a plain rescale factor),
      4. the ratio between the IFM and OFM quantization scales, or scale 1 / shift 0 if either is missing.
    """
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    activation = pool_op.activation

    def bits_needed(factor):
        # Number of binary digits of the rounded-up factor ("- 2" drops the "0b" prefix), plus one
        return len(bin(round_up_to_int(factor))) - 2 + 1

    def quantise_with_rescale(factor, factor_bits):
        # Quantise the pooling scale for this kernel size, then fold the rescale factor into the multiplier
        multiplier, right_shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, factor_bits)
        return int(round_away_zero(multiplier * factor)), right_shift

    if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            log2_scale = math.log2(ifm_quant.scale_f32)
            nearest_exponent = int(round(log2_scale))
            is_power_of_two = abs(log2_scale - nearest_exponent) < 0.001
            shift = nearest_exponent + 12
            act_type = activation.op_type
            if is_power_of_two and (
                (act_type == NpuActivationOp.TANH and shift in (0, 1))
                or (act_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling when the input scale is 2**-12 (tanh and sigmoid) or 2**-11 (tanh only)
                scale = 3 << shift
                shift = 0
            else:
                # Double the rescale factor, counting the doublings in shift, until it exceeds
                # half of the int16 maximum (or the shift exceeds 30)
                shift = 0
                rescale_limit = np.iinfo(np.int16).max / 2
                while rescale <= rescale_limit and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            scale, shift = quantise_with_rescale(rescale, bits_needed(rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if type(pool_op.rescale) == ExplicitScaling:
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            scale, shift = quantise_with_rescale(rescale, bits_needed(rescale))
    elif ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
        rescale_bits = 0
        if kernel.height == kernel.width == 1:
            if rescale > 1:
                rescale_bits = bits_needed(rescale)
            elif rescale < 1:
                rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
        scale, shift = quantise_with_rescale(rescale, rescale_bits)
    else:
        # No quantization scales available: no scaling
        scale = 1
        shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

729

730

731

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the scaling registers of an elementwise operation.

    OFM_SCALE is always emitted. OPA_SCALE and OPB_SCALE are additionally emitted for ADD and SUB.

    :return: the operand to scale; 0 unless the advanced add/sub scaling was used
    """
    sub_op = npu_op.sub_op_type
    op_to_scale = 0

    if sub_op in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A feature map without quantization contributes no scale
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        activation = npu_op.activation
        if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
            # A fused sigmoid/tanh overrides the output scale
            output_scale = 1 / 0x3000

        if sub_op == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale, shift = 1, 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:
            # Add/Sub: operands are not scaled by default
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False

            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale, shift = 1, 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True

            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    op_to_scale = (
                        scaling.OperandToScale.OPb
                        if op_to_scale == scaling.OperandToScale.OPa
                        else scaling.OperandToScale.OPa
                    )

            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # The OFM scale is derived from the output quantization only
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other elementwise operations: no ofm scaling
        ofm_scale, shift = 1, 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

822

# PRINT

823

# -------------------------------------------------------------------

Jacob Bohlin

e99b893

2020-07-13 16:01:51 +0200

[diff] [blame]

824

825

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

826

def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """
    Prints a three line summary of a feature map; does nothing if fm is None.

    Line 1: shape, region, layout, data type, size in bytes and quantization.
    Line 2: strides and tile information, with the tile addresses in hex.
    Line 3: the name of the feature map.

    :param fm: feature map to print, or None
    :param name: label printed in front of the summary
    """
    if fm is None:
        return

    quantization = fm.quantization
    if quantization is None:
        quant_str = "no quantization"
    else:
        quant_str = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    height, width, depth = fm.shape
    size_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(
        f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, {fm.data_type},"
        f" size={size_bytes}, {quant_str}"
    )

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    tiles = fm.tiles
    hex_addresses = list(map(hex, tiles.addresses))
    print(
        f" {stride_str}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1},"
        f" base={hex_addresses}"
    )
    print(f" name={fm.name}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

842

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

843

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

844

def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of a single NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations, are printed as a single header
    line. Block operations get a header line followed by their feature maps, kernel, padding, weights,
    scales, activation and block config.

    :param npu_op: the operation to print
    :param index: number printed at the start of the header line
    :param cmd: optional extra information appended to the header line when truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return

    # Block operation; the kernel is optional
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, strides and dilations are all 1 is labelled as fully connected.
        # k may be None (see above), so it must be checked before it is dereferenced.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # A scalar second operand is printed together with its quantized value
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")

    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    if npu_op.activation is not None:
        act = npu_op.activation
        # A NONE_OR_RELU activation without min/max carries no information and is not printed
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")

    bh, bw, bc = npu_op.block_config
    # rescale is only printed for pooling and elementwise operations
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

891

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

892

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

893

def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """
    Prints every operation in the list, numbered by its position in the list.

    :param npu_op_list: the operations to print
    :param npu_op_to_cmd: optional mapping from operation to extra information that is printed with it
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = dict()
    for position, op in enumerate(npu_op_list):
        extra_info = npu_op_to_cmd.get(op)
        print_operation(op, position, extra_info)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

897

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

898

899

# -------------------------------------------------------------------

900

# OPERATIONS

901

# -------------------------------------------------------------------

902

903

904

def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """
    Emits the NPU_OP_* command that matches the type of the given operation.

    Pooling and elementwise operations pass their mapped sub operation type as command parameter;
    DMA operations pass a value built from their channel and mode.
    """
    if isinstance(npu_op, NpuDmaOperation):
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"

918

919

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

920

def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """
    Emits the register commands of a Conv2D operation.

    Only the common registers are needed; the block traversal is taken from the operation itself.
    """
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, traversal, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

923

924

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

925

def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """
    Emits the register commands of a depthwise convolution operation.

    Only the common registers are needed; the block traversal is always DEPTH_FIRST.
    """
    traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, traversal, arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

930

931

932

def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Emits the register commands of a pooling operation.

    Global OFM scaling is used for AVERAGE and REDUCE_SUM operations without padding. When the operation's
    rescale is an ExplicitScaling, that decides instead: global scaling is used unless it is per channel.

    :raises VelaError: for a REDUCE_SUM whose IFM layout is not NHWC, if the IFM data type is INT32 or the
        accelerator config is Ethos_U65_512
    """
    pooling_type = npu_op.sub_op_type

    # check that reduce_sum input is NHWC
    if pooling_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        if arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # The padding is only inspected for the pooling types that can use global scaling
    if pooling_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM):
        use_global_scale = sum(npu_op.padding) == 0
    else:
        use_global_scale = False

    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    rescale = npu_op.rescale
    if rescale is not None and type(rescale) == ExplicitScaling:
        use_global_scale = not rescale.per_channel

    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)

    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

957

958

959

def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Emits the register commands of an elementwise operation.

    The scaling registers are emitted first, then the common registers, and finally, for operations that are
    not in UNARY_ELEMWISE_OPS, the IFM2 registers (including the quantized scalar if IFM2 is a scalar).
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no IFM2 to describe
        return

    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        # The scalar is quantized with the IFM2 quantization and must fit in the IFM2 data type
        quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

984

985

986

def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """
    Emits the register commands of a DMA operation.

    Sets the source region and address, then the destination region and address, and finally the length,
    which is taken from the source.
    """
    source = dma_op.src
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    destination = dma_op.dest
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

994

995

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

996

def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.

    Dispatches on the type of the operation to the matching generate_*_op function.
    Nothing is returned; all output goes to the emitter.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        # DMA operations do not need the architecture features
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1013

1014

1015

def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates the register command stream for a list of NPU operations.

    For every operation this emits its register settings, a BLOCKDEP setting (block operations only), the
    required wait commands and finally the NPU_OP command. The stream is terminated with NPU_OP_STOP.

    :param npu_op_list: the operations to generate commands for, in execution order
    :param arch: architecture features
    :param verbose: if True, prints the operations up front and the generated commands at the end
    :param mem_limits: passed to check_mem_limits together with the memory accesses of each operation
    :param add_to_debug_db: optional callable, called as add_to_debug_db(npu_op, emit.offset) after the
        NPU_OP command of each operation has been emitted
    :param npu_op_to_cmd: optional mapping that is only used when printing the operations
    :return: Ethos-U instructions, as a list of 32-bit integers
    :raises VelaError: if generating an operation fails (the message is extended with the operation index
        and type), or if the command stream reaches the size limit of 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for op in npu_op_list:
        if isinstance(op, NpuDmaOperation):
            memory_accesses[op] = get_dma_memory_accesses(op)
        elif isinstance(op, NpuBlockOperation):
            memory_accesses[op] = get_op_memory_accesses(op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)
    prev_op = None  # most recent block operation; DMA operations do not update it

    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        if isinstance(npu_op, NpuBlockOperation) and not isinstance(npu_op, NpuDmaOperation):
            # Generate BLOCKDEP, limited to what the architecture supports
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    max_stream_bytes = 1 << 24
    if emit.size_in_bytes() >= max_stream_bytes:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1082

def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1083

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1084

Internal implementation of the public facing API for generating an Ethos-U register command stream.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1085

Calculates dependencies between commands and inserts wait operations if needed.

1086

1087

:param npu_op_list: List[NpuOperation] list of high level NPU operations

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

1088

:param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator

1089

:return Ethos-U instructions, as a list of 32-bit integers

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1090

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1091

accelerator = Accelerator.from_npu_accelerator(npu_accelerator)

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

1092

arch = create_default_arch(accelerator)

Louis Verhaard