Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

17

# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

19

# stream suitable for interpretation by the Ethos-U processor.

Louis Verhaard

c629129

2021-03-19 09:35:48 +0100

[diff] [blame]

20

import math

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

21

from collections import defaultdict

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

22

from enum import Enum

23

from enum import IntEnum

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

24

from typing import cast

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

25

from typing import Dict

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

26

from typing import List

27

from typing import Optional

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

from . import scaling

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

32

from .api import NpuAccelerator

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

33

from .api import NpuActivation

34

from .api import NpuActivationOp

35

from .api import NpuAddressRange

36

from .api import NpuBlockOperation

37

from .api import NpuBlockTraversal

38

from .api import NpuConv2DOperation

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

39

from .api import NpuConvDepthWiseOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

40

from .api import NpuDataType

41

from .api import NpuDmaOperation

42

from .api import NpuElementWiseOp

43

from .api import NpuElementWiseOperation

44

from .api import NpuFeatureMap

45

from .api import NpuKernel

46

from .api import NpuLayout

47

from .api import NpuOperation

48

from .api import NpuOperationType

49

from .api import NpuPadding

50

from .api import NpuPoolingOp

51

from .api import NpuPoolingOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

52

from .api import NpuResamplingMode

53

from .api import NpuRoundingMode

54

from .api import NpuShape3D

55

from .api import NpuTileBox

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

56

from .architecture_allocator import ArchitectureBlockConfig

57

from .architecture_allocator import try_block_config

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

58

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

59

from .architecture_features import ArchitectureFeatures

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

60

from .architecture_features import create_default_arch

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

61

from .architecture_features import SHRAMElements

erik.andersson@arm.com

1878dab

2021-03-16 09:40:24 +0100

[diff] [blame]

62

from .errors import VelaError

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

63

from .ethos_u55_regs.ethos_u55_regs import acc_format

64

from .ethos_u55_regs.ethos_u55_regs import activation

65

from .ethos_u55_regs.ethos_u55_regs import cmd0

66

from .ethos_u55_regs.ethos_u55_regs import cmd1

67

from .ethos_u55_regs.ethos_u55_regs import elementwise_mode

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

68

from .ethos_u55_regs.ethos_u55_regs import pooling_mode

Jacob Bohlin

cf7da10

2020-05-20 09:03:40 +0200

[diff] [blame]

69

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

70

from .ethos_u55_regs.ethos_u55_regs import rounding

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

71

from .numeric_util import round_away_zero

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

72

from .numeric_util import round_up_to_int

Patrik Gustavsson

2021-08-17 14:26:38 +0200

[diff] [blame]

73

from .operation import ExplicitScaling

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

74

from .operation import NpuBlockType

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

75

from .range_set import MemoryAccessSet

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

76

from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

77

from .register_command_stream_util import calc_blockdep

78

from .register_command_stream_util import get_dma_memory_accesses

79

from .register_command_stream_util import get_op_memory_accesses

80

from .register_command_stream_util import get_strides

81

from .register_command_stream_util import get_wait_dependency

Fredrik Svedberg

2022-11-04 09:48:49 +0100

[diff] [blame]

82

from .register_command_stream_util import get_zero_point

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

83

from .register_command_stream_util import has_ifm2

Fredrik Svedberg

2022-11-04 09:48:49 +0100

[diff] [blame]

84

from .register_command_stream_util import quantise

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

85

from .register_command_stream_util import shape3d_to_block

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

86

from .register_command_stream_util import to_kernel

87

from .register_command_stream_util import UNARY_ELEMWISE_OPS

88

from .register_command_stream_util import Watermark

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

89

90

91

class RegisterMachine:
    """Shadow copy of register contents used to detect redundant register writes.

    One dictionary of last-written values is kept per bank; a register that has
    never been written reads back as None.
    """

    def __init__(self):
        self.n_banks = 1
        self.bank_idx = 0
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]

    def set_register(self, reg, value):
        """Stores value for reg in the active bank and returns True if it differs from the stored value"""
        bank = self.registers[self.bank_idx]
        previous = bank[reg]
        bank[reg] = value
        return previous != value

    def switch_bank(self):
        """Moves to the next bank, wrapping around after the last one"""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

105

106

107

class CmdMode(IntEnum):
    # Bit fields of the 16-bit command code held in the low half of the first command word
    NoPayload = 0x0000  # payload-mode value: the command is a single word
    Payload32 = 0x4000  # payload-mode value: the command is followed by one 32-bit payload word
    Mask = 0xC000  # selects the payload-mode bits of the code
    CmdOpMask = 0x03FF  # selects the bits that are looked up in cmd0/cmd1

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

114

class CommandStreamEmitter:
    """Collects command words for the command stream and filters out redundant register writes"""

    WORD_SIZE = 4

    def __init__(self):
        # One tuple per emitted command: (word,) or (word, payload)
        self.cmd_stream = []
        # Index 0 tracks non-DMA registers, index 1 tracks DMA registers (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Running size in bytes of everything appended so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine for cmd: the DMA one if the command name contains "DMA" """
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the total size of the emitted commands in bytes"""
        return sum(len(cmd) * CommandStreamEmitter.WORD_SIZE for cmd in self.cmd_stream)

    def to_list(self) -> List[int]:
        """Returns all emitted words as one flat list, in emission order"""
        words = []
        for cmd in self.cmd_stream:
            words.extend(cmd)
        return words

    def print_cmds(self):
        """Prints a table with one row per emitted command"""
        header = (
            f" {'Offset':6}:"
            f" {'Payload':8}"
            f"{'Param':4}"  # no leading space for alignment
            f" {'Code':4}"
            f" - {'Command':30}"
            f" {'Param':5}"
        )
        print(header)

        offset = 0
        for words_for_one_command in self.cmd_stream:
            first_word = words_for_one_command[0]
            code = first_word & 0x0000FFFF  # lower 16 bits
            param = first_word >> 16  # higher 16 bits
            payload_mode = CmdMode(code & CmdMode.Mask)

            row = [f"{offset:#08x}:"]
            if payload_mode == CmdMode.NoPayload:
                row.append(f" {'':8}")
            else:
                assert payload_mode == CmdMode.Payload32
                row.append(f" {words_for_one_command[1]:08x}")
            row.append(f" {param:04x}")
            row.append(f" {code:04x}")

            if payload_mode == CmdMode.NoPayload:
                row.append(f" - {cmd0(code & CmdMode.CmdOpMask):30}")
                offset += 4
            else:
                row.append(f" - {cmd1(code & CmdMode.CmdOpMask):30}")
                offset += 8
            row.append(f" {param:5}")
            print("".join(row))

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word register write unless the register already holds this value"""
        raw = param.value if isinstance(param, Enum) else param
        param = int(raw) & 0xFFFF
        command = cmd.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command,))
            self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a register write with a 32-bit payload unless the register already holds this value"""
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)
        if self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            # This is not a redundant command, actually write it
            self.cmd_stream.append((command, offset))
            self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits offset as a payload command, passing the bits above bit 31 as the param"""
        upper_bits = offset >> 32
        self.cmd1_with_offset(cmd, offset, upper_bits)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command whose param is 16 * channel + outstanding_count; never filtered"""
        param = (16 * channel) + outstanding_count
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (never filtered) and switches the bank of its register machine"""
        param = int(param)
        self.cmd_stream.append((((param & 0xFFFF) << 16) | cmd.value,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()

217

218

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

219

# -------------------------------------------------------------------

220

# REGISTER GENERATION

221

# -------------------------------------------------------------------

222

223

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

224

# TODO: Replace with definitions from ethos_u55_regs

225

class IFM2Broadcast(IntEnum):
    # Bit flags OR-ed together by generate_ifm2_broadcast to form the NPU_SET_IFM2_BROADCAST parameter
    BroadcastHdim = 1 << 0  # set when IFM2 height is 1 and differs from the IFM height
    BroadcastWdim = 1 << 1  # set when IFM2 width is 1 and differs from the IFM width
    BroadcastCdim = 1 << 2  # set when IFM2 depth is 1 and differs from the IFM depth
    ReverseOperandOrder = 1 << 6  # set when the operation has reversed_operands
    UseIFM2Scalar = 1 << 7  # set when the operation has an ifm2_scalar

# Maps an NpuPoolingOp to the .value of the corresponding pooling_mode member
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an NpuElementWiseOp to the .value of the corresponding elementwise_mode member
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an NpuActivationOp to the corresponding activation member (the member itself, not its .value).
# TABLE_LOOKUP has no entry here; generate_activation computes its value separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator entry to the .value of the corresponding acc_format member
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an NpuResamplingMode to the corresponding resampling_mode member (the member itself, not its .value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps an NpuRoundingMode to the .value of the corresponding rounding member
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

282

def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    Every start and end offset of every accessed range must lie in the closed
    interval [0, mem_limits[region]].

    Raises:
        VelaError: if a region is missing from mem_limits, or an offset is
            negative or larger than the limit of its region.
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named region_limit rather than "max" so that the builtin is not shadowed
            region_limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > region_limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {region_limit}."
                            " Perhaps try running with the HillClimb tensor allocator and/or increasing the"
                            " maximum iteration of that allocator"
                        )

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

299

300

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

301

def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Emits the IFM_PAD_TOP/LEFT/BOTTOM/RIGHT registers from the given padding"""
    set_reg = emit.cmd0_with_param
    set_reg(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    set_reg(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    set_reg(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    set_reg(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)

307

308

309

def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Emits the ACTIVATION, ACTIVATION_MIN and ACTIVATION_MAX registers

    A missing activation is treated as NONE_OR_RELU. A clamp limit that is not
    given falls back to the OFM data type's limit, and both limits are then kept
    within the int16 range as well as the OFM data type's range.
    """
    if activation is None:
        act = NpuActivation(NpuActivationOp.NONE_OR_RELU)
    else:
        act = activation
    ofm_type = ofm.data_type

    quantized_min = ofm_type.min_value() if act.min is None else quantise(act.min, ofm.quantization)
    quantized_max = ofm_type.max_value() if act.max is None else quantise(act.max, ofm.quantization)

    int16_limits = np.iinfo(np.int16)
    quantized_min = max(quantized_min, int16_limits.min, ofm_type.min_value())
    quantized_max = min(quantized_max, int16_limits.max, ofm_type.max_value())

    if act.op_type != NpuActivationOp.TABLE_LOOKUP:
        activation_value = cast(int, activation_op_map[act.op_type])
    else:
        assert 0 <= act.lookup_table_index < 8
        # Table lookups are encoded as 16 plus the index of the table
        activation_value = 16 + act.lookup_table_index
        if ofm_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)

335

336

337

def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Emits the four xFM_BASE registers, pairing ptr_cmds[i] with addresses[i]"""
    if layout == NpuLayout.NHCWB16:
        # Every base pointer must be aligned to 16 bytes in this layout
        assert not any(int(address) % 16 for address in addresses)
    for tile_index in range(4):
        emit.cmd1_with_address(ptr_cmds[tile_index], addresses[tile_index])

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

344

345

346

def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Emits xFM_HEIGHT0/HEIGHT1/WIDTH0; each register is written with the dimension minus one"""
    height0_cmd, height1_cmd, width0_cmd = tile_cmds[0], tile_cmds[1], tile_cmds[2]
    emit.cmd0_with_param(height0_cmd, tiles.height_0 - 1)
    emit.cmd0_with_param(height1_cmd, tiles.height_1 - 1)
    emit.cmd0_with_param(width0_cmd, tiles.width_0 - 1)

351

352

353

def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Emits the STRIDE_C/Y/X registers using the strides that get_strides returns for fm"""
    fm_strides = get_strides(fm)
    write_stride = emit.cmd1_with_address
    write_stride(stride_c_cmd, fm_strides.depth)  # stride between 16-byte channel blocks (C)
    write_stride(stride_y_cmd, fm_strides.height)  # stride between vertical values (H)
    write_stride(stride_x_cmd, fm_strides.width)  # stride between horizontal values (W)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

361

362

363

def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Emits the IFM or IFM2 PRECISION register selected by precision_cmd"""
    data_type = fm.data_type
    signed_bit = 1 if data_type.is_signed() else 0
    # Bit 0 holds the signedness, the activation precision is placed from bit 2
    precision = signed_bit + (precision_map[data_type.size_in_bits()] << 2)
    if fm.layout == NpuLayout.NHCWB16:
        precision |= 1 << 6
    precision |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, precision)

375

376

377

def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Emits the OFM_PRECISION register for the OFM of npu_op"""
    ofm = npu_op.ofm
    data_type = ofm.data_type
    signed_bit = 1 if data_type.is_signed() else 0
    # Bit 0 holds the signedness, the activation precision is placed from bit 1
    precision = signed_bit + (precision_map[data_type.size_in_bits()] << 1)
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        precision |= 1 << 8
    if ofm.layout == NpuLayout.NHCWB16:
        precision |= 1 << 6
    precision |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, precision)

391

392

393

def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Emits the IFM2_BROADCAST register for binary elementwise operations"""
    flags = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        flags |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        flags |= IFM2Broadcast.UseIFM2Scalar
    else:
        # A dimension that differs between IFM and IFM2 must be 1 in IFM2 and is then broadcast
        per_dimension_flags = (
            ("height", IFM2Broadcast.BroadcastHdim),
            ("width", IFM2Broadcast.BroadcastWdim),
            ("depth", IFM2Broadcast.BroadcastCdim),
        )
        for dimension, broadcast_flag in per_dimension_flags:
            if getattr(ifm.shape, dimension) != getattr(ifm2.shape, dimension):
                assert getattr(ifm2.shape, dimension) == 1
                flags |= broadcast_flag

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, flags)

420

421

422

def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Emits the IFM region, base address, tile, depth, stride and zero point registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)

    base_cmds = [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3]
    generate_addresses(emit, base_cmds, ifm.tiles.addresses, ifm.layout)

    tile_cmds = [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ifm.tiles)

    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, get_zero_point(ifm))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

437

438

439

def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Emits the IFM2 registers; region, base addresses and tiles are skipped when IFM2 is a scalar"""
    # NOTE(review): the nesting here is reconstructed - strides and zero point are taken to be
    # emitted for scalar IFM2 as well; confirm against the original indentation.
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)

        base_cmds = [
            cmd1.NPU_SET_IFM2_BASE0,
            cmd1.NPU_SET_IFM2_BASE1,
            cmd1.NPU_SET_IFM2_BASE2,
            cmd1.NPU_SET_IFM2_BASE3,
        ]
        generate_addresses(emit, base_cmds, ifm2.tiles.addresses, ifm2.layout)

        tile_cmds = [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1]
        generate_tiles(emit, tile_cmds, ifm2.tiles)

    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, get_zero_point(ifm2))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

454

455

456

def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Emits the OFM region, base address, tile, shape, stride and zero point registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)

    base_cmds = [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3]
    generate_addresses(emit, base_cmds, ofm.tiles.addresses, ofm.layout)

    tile_cmds = [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1]
    generate_tiles(emit, tile_cmds, ofm.tiles)

    # The shape registers hold each dimension minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, get_zero_point(ofm))

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

473

474

475

def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Emits the KERNEL_HEIGHT_M1, KERNEL_WIDTH_M1 and KERNEL_STRIDE registers"""
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))

    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    stride = (
        (stride_x_m1 & 1)  # kernel x stride low bit
        | ((stride_y_m1 & 1) << 1)  # kernel y stride low bit
        | ((stride_x_m1 >> 1) << 6)  # kernel x stride extension bits
        | ((stride_y_m1 >> 1) << 9)  # kernel y stride extension bits
        | ((kernel.dilation_x - 1) << 3)
        | ((kernel.dilation_y - 1) << 4)
    )
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

492

493

494

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Emits the WEIGHT region, base and length registers; does nothing when weights is empty"""
    if len(weights) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)

    # One (base, length) register pair per core
    per_core_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    range_count = len(weights)
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < range_count:
            emit.cmd1_with_address(base_cmd, weights[core].address)
            emit.cmd1_with_offset(length_cmd, weights[core].length)
        elif core < arch.ncores:
            # Core exists but has no range of its own: first range's address, zero length
            emit.cmd1_with_address(base_cmd, weights[0].address)
            emit.cmd1_with_offset(length_cmd, 0)

512

513

514

def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Emits the SCALE region, base and length registers; does nothing when biases is empty"""
    if len(biases) == 0:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)

    # One (base, length) register pair per core
    per_core_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    range_count = len(biases)
    for core, (base_cmd, length_cmd) in enumerate(per_core_cmds):
        if core < range_count:
            emit.cmd1_with_address(base_cmd, biases[core].address)
            emit.cmd1_with_offset(length_cmd, biases[core].length)
        elif core < arch.ncores:
            # Core exists but has no range of its own: first range's address, zero length
            emit.cmd1_with_address(base_cmd, biases[0].address)
            emit.cmd1_with_offset(length_cmd, 0)

529

530

531

def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Emits OFM_BLK_HEIGHT/WIDTH/DEPTH; each register is written with the dimension minus one"""
    set_reg = emit.cmd0_with_param
    set_reg(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    set_reg(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    set_reg(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

539

540

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

541

def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Emits IFM_IB_END, AB_START, ACC_FORMAT and, for operations with an IFM2, IFM2_IB_START"""
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    accumulator_format = acc_format_map[arch_block_config.acc_type]
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, accumulator_format)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

552

553

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

554

def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Intended to return the ArchitectureBlockConfig that corresponds to npu_op.block_config,
    or None if the block_config does not fit.

    NOTE(review): no implementation is visible for this function; as written it always returns None.
    """
    return None

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

561

562

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

563

def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Builds the ArchitectureBlockConfig for npu_op.block_config via try_block_config

    Asserts that block_config is set, that the operation type is supported and
    that the block config fits.
    """
    assert npu_op.block_config is not None, "block_config has not been set"

    # Derive the block type from the class of the operation
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM:
            block_type = NpuBlockType.ReduceSum
        else:
            block_type = NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"

    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST

    # A table lookup activation takes two LUT banks, anything else takes none
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0

    # The operation counts as scaled only if every feature map has quantization with a scale
    feature_maps = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        feature_maps.append(npu_op.ifm2)
    all_fms_have_quant = all(
        fm.quantization is not None and fm.quantization.scale_f32 is not None for fm in feature_maps
    )

    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    ofm_shape = shape3d_to_block(npu_op.ofm.shape)

    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        ofm_shape,
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

612

613

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

614

def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits the KERNEL_WAIT/DMA_WAIT commands described by cmd_waits.

    A KERNEL_WAIT is emitted when cmd_waits.npu is non-negative, followed by a
    DMA_WAIT when cmd_waits.dma is non-negative. A negative value means that no
    wait command of that kind is emitted.
    """
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in pending_waits:
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)

621

622

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

623

def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """Emits the register settings that most block operations have in common.

    The registers are generated in this order: IFM, IFM precision, IFM upscale,
    padding (only when set), OFM, OFM precision, kernel (not for elementwise
    operations), weights, biases, activation, block config and SHRAM registers.

    :param emit: emitter that receives the commands
    :param npu_op: operation to generate registers for; its ifm and ofm must be set
    :param block_traversal: forwarded to the kernel and architecture block config helpers
    :param arch: forwarded to the weight, bias and architecture block config helpers
    :param use_global_scale: forwarded to generate_ofm_precision
    :param op_to_scale: forwarded to generate_ifm_precision for the IFM
    """
    ifm = npu_op.ifm
    ofm = npu_op.ofm
    assert ifm is not None
    assert ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_mode = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_mode)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Kernel registers are only generated for non-elementwise operations
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, ofm)

    # The architecture block config is looked up before the block config register is
    # emitted, and is then used for the SHRAM registers
    shram_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_block_config)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

649

650

651

# -------------------------------------------------------------------

652

# SCALING

653

# -------------------------------------------------------------------

654

655

656

def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """Generates OFM_SCALE register for pooling operations.

    The scale and shift are chosen by the first case that applies:

    1. SIGMOID or TANH activation: derived from the IFM scale (which must be set)
    2. pool_op.fused_quantize: the quantised ratio IFM scale / OFM scale
    3. pool_op.rescale is set: either taken directly from an ExplicitScaling, or
       derived from the rescale factor and the kernel size
    4. otherwise: derived from the ratio IFM scale / OFM scale, or scale=1, shift=0
       when either of the two scales is missing

    :param emit: emitter that receives the NPU_SET_OFM_SCALE command
    :param pool_op: pooling operation to generate the OFM scaling for
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    activation = pool_op.activation
    if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an input scale of 2**-12 (1/4096),
            # shift == 1 to an input scale of 2**-11 (1/2048)
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (tanh/sigmoid) or 1/2048 (tanh)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale, counting the doublings in shift, until it exceeds
                # half of the int16 maximum or shift has passed 30
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No IFM/OFM scale available: no scaling
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

728

729

730

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the OFM_SCALE register for an elementwise operation; for ADD and SUB the
    OPA_SCALE and OPB_SCALE registers are emitted before it.

    Returns the operand to scale. This is 0 unless the advanced ADD/SUB scaling is
    used, in which case it is the operand selected by
    scaling.advanced_elementwise_add_sub_scale, swapped between OPa and OPb when
    npu_op.reversed_operands is set.
    """

    def scale_or_none(fm):
        # Scale of the feature map, or None when it has no quantization
        quant = fm.quantization
        return quant.scale_f32 if quant else None

    sub_op = npu_op.sub_op_type
    scaled_operand = 0
    if sub_op in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        ifm_scale = scale_or_none(npu_op.ifm)
        ifm2_scale = scale_or_none(npu_op.ifm2)
        out_scale = scale_or_none(npu_op.ofm)

        act = npu_op.activation
        if act is not None and act.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
            # Sigmoid/tanh replace the output scale with a fixed one
            out_scale = 1 / 0x3000

        if sub_op == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                ofm_scale, ofm_shift = npu_op.rescale
            elif None in (ifm_scale, ifm2_scale, out_scale):
                ofm_scale, ofm_shift = 1, 0
            else:
                ofm_scale, ofm_shift = scaling.elementwise_mul_scale(ifm_scale, ifm2_scale, out_scale)
        else:
            # ADD/SUB: operands are unscaled unless one of the cases below says otherwise
            opa_scale, opb_scale, opa_shift = 1, 1, 0
            ifm_bits = npu_op.ifm.data_type.size_in_bits()
            advanced = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, ofm_shift = npu_op.rescale
            elif None in (ifm_scale, ifm2_scale, out_scale):
                # No ofm scaling
                ofm_scale, ofm_shift = 1, 0
            elif ifm_scale == ifm2_scale:
                # Both inputs share one scale: start from the simplified scaling
                opa_scale, opb_scale, ofm_scale, ofm_shift = scaling.simplified_elementwise_add_sub_scale(
                    ifm_scale, ifm2_scale, out_scale
                )
                if ifm_bits == 16:
                    # int16: align the double rounding with that of advanced scaling
                    opa_scale //= 2
                    opb_scale //= 2
                    ofm_shift -= 1
                else:
                    # For 8 bit we can't guarantee double rounding with simplified scaling will
                    # always be the same as with advanced scaling due to different shifts. When
                    # the low 12 bits of the ofm scale are all zero, double rounding has no effect
                    # for advanced scaling no matter the input, so simplified scaling can be kept;
                    # otherwise fall back to advanced scaling.
                    advanced = int(ofm_scale) & 0xFFF != 0
            else:
                advanced = True

            if advanced:
                # Advanced implementation: used when the input scales differ, or when the
                # absence of rounding errors can't be guaranteed
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    ofm_shift,
                    scaled_operand,
                ) = scaling.advanced_elementwise_add_sub_scale(ifm_scale, ifm2_scale, out_scale, ifm_bits)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # Reversed operand order also swaps which operand is scaled
                    scaled_operand = (
                        scaling.OperandToScale.OPb
                        if scaled_operand == scaling.OperandToScale.OPa
                        else scaling.OperandToScale.OPa
                    )
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        ofm_scale, ofm_shift = scaling.quantise_scale(npu_op.ofm.quantization.scale_f32)
    else:
        ofm_scale, ofm_shift = 1, 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, ofm_shift)
    return scaled_operand

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

821

# PRINT

822

# -------------------------------------------------------------------

Jacob Bohlin

e99b893

2020-07-13 16:01:51 +0200

[diff] [blame]

823

824

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

825

def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, region, layout, data type, size, quantization, strides, tiles
    and name of the given feature map; prints nothing when fm is None.

    :param fm: feature map to describe, or None
    :param name: label printed in front of the description
    """
    if fm is None:
        return

    quant = fm.quantization
    if quant is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    h, w, c = fm.shape
    # Size in bytes of h * w * c elements of the feature map's data type
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")

    strides = get_strides(fm)
    t = fm.tiles
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

841

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

842

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

843

def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """Prints a description of one NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations, are
    printed as a single header line. For block operations the header line is
    followed by the feature maps, kernel, padding, weights, scales, activation,
    block traversal (Conv2D only) and block config.

    :param npu_op: operation to print
    :param index: number printed at the start of the header line
    :param cmd: optional extra information; appended to the header line when truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    is_dma = isinstance(npu_op, NpuDmaOperation)
    is_block = isinstance(npu_op, NpuBlockOperation)

    if isinstance(npu_op, NpuOperation) and not (is_dma or is_block):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if is_dma:
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return

    # Everything below describes a block operation
    k = to_kernel(npu_op.kernel) if npu_op.kernel is not None else None
    is_conv2d = isinstance(npu_op, NpuConv2DOperation)
    has_sub_op = isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation))

    # Header line
    if has_sub_op:
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        fc = ""
        if is_conv2d:
            # A Conv2D whose kernel size, strides and dilations multiply to 1 is labelled FullyConnected
            if k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1:
                fc = "FullyConnected "
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")

    # Feature maps
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is None:
        print_feature_map(npu_op.ifm2, "IFM2")
    else:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    print_feature_map(npu_op.ofm, "OFM")

    # Kernel, padding, weights and scales
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")

    # The activation is only printed when it is not a NONE_OR_RELU without min and max
    act = npu_op.activation
    if act is not None and (
        act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None
    ):
        lut = ""
        if act.op_type == NpuActivationOp.TABLE_LOOKUP:
            lut = f", lut index={act.lookup_table_index}"
        print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")

    if is_conv2d:
        print(f" {npu_op.block_traversal}")

    bh, bw, bc = npu_op.block_config
    rescale = f", rescale={npu_op.rescale}" if has_sub_op else ""
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

890

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

891

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

892

def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in npu_op_list together with its index.

    :param npu_op_list: operations to print
    :param npu_op_to_cmd: optional mapping from operation to extra information that is
        passed to print_operation; operations without an entry get None
    """
    cmd_lookup = {} if npu_op_to_cmd is None else npu_op_to_cmd
    for position, operation in enumerate(npu_op_list):
        print_operation(operation, position, cmd_lookup.get(operation))

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

896

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

897

898

# -------------------------------------------------------------------

899

# OPERATIONS

900

# -------------------------------------------------------------------

901

902

903

def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the class of npu_op.

    Pooling and elementwise operations pass their mapped sub operation type as
    parameter; DMA operations pass channel * 16 + mode.
    """
    if isinstance(npu_op, NpuDmaOperation):
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    # None of the known operation classes matched
    assert 0, "Unsupported operation"

917

918

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

919

def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the registers for a Conv2D operation.

    Only the common registers are generated, using the block traversal that is
    stored on the operation itself.
    """
    generate_common(emit, npu_op, block_traversal=npu_op.block_traversal, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

922

923

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

924

def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the registers for a depthwise convolution.

    Only the common registers are generated, always with depth first block traversal.
    """
    generate_common(emit, npu_op, block_traversal=NpuBlockTraversal.DEPTH_FIRST, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

929

930

931

def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """Generates register commands for pooling operations.

    Generates the common registers with depth first block traversal and, when a
    global OFM scale is used, the OFM_SCALE register.

    :param emit: emitter that receives the commands
    :param npu_op: pooling operation to generate registers for
    :param arch: architecture features; also used for the accelerator config check
    :raises VelaError: for a REDUCE_SUM whose IFM layout is not NHWC, when the IFM data
        type is INT32 or the accelerator config is Ethos_U65_512
    """
    # check that reduce_sum input is NHWC
    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # By default a global scale is used for AVERAGE/REDUCE_SUM when the padding values sum to zero
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # (isinstance is False for None, so no separate None check is needed)
    if isinstance(npu_op.rescale, ExplicitScaling):
        # Explicit scaling decides on its own: global scale unless it is per channel
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

956

957

958

def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the registers for an elementwise operation.

    The scaling registers are generated first, then the common registers, and for
    operations that are not in UNARY_ELEMWISE_OPS finally the IFM2 registers
    (including the quantised scalar when ifm2_scalar is set).
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    scaled_operand = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit,
        npu_op,
        NpuBlockTraversal.DEPTH_FIRST,
        arch,
        use_global_scale=use_global_scale,
        op_to_scale=scaled_operand,
    )

    # Unary operations have no second input, so nothing more to generate
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        return

    # Binary operation; generate IFM2 registers
    ifm2 = npu_op.ifm2
    assert ifm2 is not None
    scalar = npu_op.ifm2_scalar
    has_scalar = scalar is not None
    generate_ifm2(emit, ifm2, has_scalar)
    generate_ifm_precision(emit, ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if has_scalar:
        # The quantised scalar must be representable in the IFM2 data type
        quantized_scalar = quantise(scalar, ifm2.quantization)
        assert ifm2.data_type.min_value() <= quantized_scalar <= ifm2.data_type.max_value()
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

983

984

985

def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits the DMA0 registers for a DMA operation: source region and address,
    destination region and address, and the length taken from dma_op.src.length"""
    source = dma_op.src
    destination = dma_op.dest
    # Source
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)
    # Destination
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)
    # Length
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

993

994

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

995

def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for the given operation, but not the final NPU_OP_... command.

    Dispatches on the class of npu_op to the matching generate_*_op function.
    Nothing is returned; the commands are added to emit.

    :param emit: emitter that receives the commands
    :param npu_op: operation to generate registers for
    :param arch: architecture features, passed on to all generators except the DMA one
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
    elif isinstance(npu_op, NpuDmaOperation):
        generate_dma_op(emit, npu_op)
    else:
        assert 0, "Unsupported operation"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1012

1013

1014

def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates the register command stream for a list of NPU operations.

    For every operation the memory limits are checked, the wait dependencies are
    calculated and the registers are generated; block operations additionally get a
    BLOCKDEP setting. After that the wait commands and the NPU_OP command are
    emitted. The stream is terminated with NPU_OP_STOP.

    :param npu_op_list: operations to generate commands for
    :param arch: architecture features
    :param verbose: when set, the operations are printed up front and the generated
        commands and their totals at the end
    :param mem_limits: passed to check_mem_limits together with the memory accesses of
        each operation
    :param add_to_debug_db: optional callable, invoked as add_to_debug_db(npu_op, emit.offset)
        after the NPU_OP command of each operation
    :param npu_op_to_cmd: optional mapping that is passed on to print_operations
    :return: Ethos-U instructions, as a list of 32-bit integers
    :raises VelaError: when processing an operation fails (the operation index and type
        are added to the message), or when the stream is 1 << 24 bytes or larger
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for operation in npu_op_list:
        if isinstance(operation, NpuDmaOperation):
            accesses = get_dma_memory_accesses(operation)
        elif isinstance(operation, NpuBlockOperation):
            accesses = get_op_memory_accesses(operation, arch)
        else:
            assert 0, "Invalid operation type"
            # Only reached when asserts are disabled: no entry is stored for this operation
            continue
        memory_accesses[operation] = accesses

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)
    # Most recent block operation; input to the BLOCKDEP calculation
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        is_dma = isinstance(npu_op, NpuDmaOperation)
        if not is_dma and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP, limited to what the architecture supports
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1081

def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1082

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1083

Internal implementation of the public facing API for generating an Ethos-U register command stream.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1084

Calculates dependencies between commands and inserts wait operations if needed.

1085

1086

:param npu_op_list: List[NpuOperation] list of high level NPU operations

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

1087

:param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator

1088

:return Ethos-U instructions, as a list of 32-bit integers

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1089

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1090

accelerator = Accelerator.from_npu_accelerator(npu_accelerator)

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

1091

arch = create_default_arch(accelerator)

Louis Verhaard