Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:
# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations, generates
# all the register settings, calculates dependencies between commands, inserts wait operations, and generates a
# bit stream suitable for interpretation by the Ethos-U processor.

Louis Verhaard

c629129

2021-03-19 09:35:48 +0100

[diff] [blame]

20

import math

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

21

from collections import defaultdict

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

22

from enum import Enum

23

from enum import IntEnum

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

24

from typing import cast

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

25

from typing import Dict

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

26

from typing import List

27

from typing import Optional

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

from . import scaling

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

32

from .api import NpuAccelerator

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

33

from .api import NpuActivation

34

from .api import NpuActivationOp

35

from .api import NpuAddressRange

36

from .api import NpuBlockOperation

37

from .api import NpuBlockTraversal

38

from .api import NpuConv2DOperation

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

39

from .api import NpuConvDepthWiseOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

40

from .api import NpuDataType

41

from .api import NpuDmaOperation

42

from .api import NpuElementWiseOp

43

from .api import NpuElementWiseOperation

44

from .api import NpuFeatureMap

45

from .api import NpuKernel

46

from .api import NpuLayout

47

from .api import NpuOperation

48

from .api import NpuOperationType

49

from .api import NpuPadding

50

from .api import NpuPoolingOp

51

from .api import NpuPoolingOperation

52

from .api import NpuQuantization

53

from .api import NpuResamplingMode

54

from .api import NpuRoundingMode

55

from .api import NpuShape3D

56

from .api import NpuTileBox

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

57

from .architecture_allocator import ArchitectureBlockConfig

58

from .architecture_allocator import try_block_config

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

59

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

60

from .architecture_features import ArchitectureFeatures

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

61

from .architecture_features import create_default_arch

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

62

from .architecture_features import SHRAMElements

erik.andersson@arm.com

1878dab

2021-03-16 09:40:24 +0100

[diff] [blame]

63

from .errors import VelaError

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

64

from .ethos_u55_regs.ethos_u55_regs import acc_format

65

from .ethos_u55_regs.ethos_u55_regs import activation

66

from .ethos_u55_regs.ethos_u55_regs import cmd0

67

from .ethos_u55_regs.ethos_u55_regs import cmd1

68

from .ethos_u55_regs.ethos_u55_regs import elementwise_mode

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

69

from .ethos_u55_regs.ethos_u55_regs import pooling_mode

Jacob Bohlin

cf7da10

2020-05-20 09:03:40 +0200

[diff] [blame]

70

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

71

from .ethos_u55_regs.ethos_u55_regs import rounding

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

72

from .numeric_util import quantise_float32

73

from .numeric_util import round_away_zero

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

74

from .numeric_util import round_up_to_int

Patrik Gustavsson

2021-08-17 14:26:38 +0200

[diff] [blame]

75

from .operation import ExplicitScaling

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

76

from .operation import NpuBlockType

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

77

from .range_set import MemoryAccessSet

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

78

from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

79

from .register_command_stream_util import calc_blockdep

80

from .register_command_stream_util import get_dma_memory_accesses

81

from .register_command_stream_util import get_op_memory_accesses

82

from .register_command_stream_util import get_strides

83

from .register_command_stream_util import get_wait_dependency

84

from .register_command_stream_util import has_ifm2

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

85

from .register_command_stream_util import shape3d_to_block

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

86

from .register_command_stream_util import to_kernel

87

from .register_command_stream_util import UNARY_ELEMWISE_OPS

88

from .register_command_stream_util import Watermark

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

89

90

91

class RegisterMachine:
    """Tracks the last value written to each register so redundant writes can be detected.

    Registers are kept in ``n_banks`` banks (currently one); ``bank_idx`` selects the
    bank that set_register reads and updates.
    """

    def __init__(self):
        self.n_banks = 1
        # One mapping per bank; a register that was never written reads back as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Stores value for reg in the current bank.

        Returns True if the stored value differs from the previously stored one
        (None when the register has not been written before), otherwise False.
        """
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # Debugging aid: set is_changed = True here to force every command to be emitted
        return is_changed

    def switch_bank(self):
        """Advances to the next bank, wrapping around after the last one"""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

105

106

107

class CmdMode(IntEnum):
    """Bit masks and payload-mode values applied to the 16-bit code of a command word"""

    NoPayload = 0x0000  # payload-mode bits for a command without a payload word
    Payload32 = 0x4000  # payload-mode bits for a command followed by one payload word
    Mask = 0xC000  # selects the payload-mode bits of the code
    CmdOpMask = 0x03FF  # selects the command opcode bits of the code

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

114

class CommandStreamEmitter:
    """Accumulates the register command stream as a list of word tuples.

    Every entry of ``cmd_stream`` holds the words of one command: a single command
    word, or a command word followed by one payload word.  Register writes are passed
    through a RegisterMachine so that a write which does not change the tracked value
    is not emitted.
    """

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []
        # Index 0 tracks non-DMA commands, index 1 tracks DMA commands (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Running size in bytes of the commands appended so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Returns the register machine for cmd: the DMA one if "DMA" occurs in cmd.name, else the other"""
        return self.reg_machine[1 if "DMA" in cmd.name else 0]

    def size_in_bytes(self):
        """Returns the total size in bytes of all commands in the stream"""
        return sum(len(cmd) for cmd in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Returns the command stream flattened into a single list of words"""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Prints a table with one row per command: offset, payload, param, code, command name"""
        header_parts = [
            f" {'Offset':6}:",
            f" {'Payload':8}",
            f"{'Param':4}",  # no leading space for alignment
            f" {'Code':4}",
            f" - {'Command':30}",
            f" {'Param':5}",
        ]
        print("".join(header_parts))

        offset = 0
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            has_payload = payload_mode != CmdMode.NoPayload
            if has_payload:
                assert payload_mode == CmdMode.Payload32

            s = f"{offset:#08x}:"
            if has_payload:
                s += f" {words_for_one_command[1]:08x}"
            else:
                s += f" {'':8}"

            s += f" {param:04x}"
            s += f" {code:04x}"

            if has_payload:
                s += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                offset += CommandStreamEmitter.WORD_SIZE * 2
            else:
                s += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                offset += CommandStreamEmitter.WORD_SIZE

            s += f" {param:5}"
            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emits a single-word command carrying a 16-bit parameter.

        param may be an Enum member (its value is used) or anything convertible with
        int(); it is truncated to 16 bits.  Nothing is emitted if the register machine
        reports that the register already holds this value.
        """
        if isinstance(param, Enum):
            param = int(param.value)
        else:
            param = int(param)
        param = param & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emits a two-word command: command word with a 16-bit param, then a 32-bit payload.

        offset is truncated to 32 bits and param to 16 bits.  Nothing is emitted if the
        register machine reports that the register already holds this value.
        """
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emits a two-word command for an address wider than 32 bits.

        The low 32 bits go in the payload word; the bits above bit 31 are passed as the
        command's param (cmd1_with_offset keeps the lowest 16 of them).
        """
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emits a wait command whose param is 16 * channel + outstanding_count.

        Wait commands bypass the register machine and are always emitted.
        """
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emits an operation command (always emitted) and switches the register bank afterwards"""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()

217

218

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

219

# -------------------------------------------------------------------

220

# REGISTER GENERATION

221

# -------------------------------------------------------------------

222

223

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

224

# TODO: Replace with definitions from ethos_u55_regs
class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST register parameter"""

    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7

# Maps an API pooling op to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise op to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation op to the corresponding activation enum member (not its .value).
# NpuActivationOp.TABLE_LOOKUP is deliberately absent; generate_activation handles it separately.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator type to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode enum member (not its .value)
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

282

def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    :param memory_accesses: the accesses to check; each access exposes a ``regions`` mapping from
        region to a range set with ``ranges`` of (start, end) offsets
    :param mem_limits: maps each valid region to the largest offset allowed in it
    :raises VelaError: if a region is missing from mem_limits, or a start/end offset is negative
        or greater than the region's limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            region_limit = mem_limits[region]
            for start, end in range_set.ranges:
                # Both ends of every range must lie within [0, region_limit]
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > region_limit:
                        raise VelaError(
                            f"Address offset out of range: {offset}, region: {region}, max: {region_limit}. Perhaps try running"
                            f" with the HillClimb tensor allocator and/or increasing the maximum iteration of that"
                            f" allocator"
                        )

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

299

300

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

301

def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value

    Uses scale 1 when quant is None or has no scale_f32, and zero point 0 when quant is None.
    """
    scale = 1 if quant is None or quant.scale_f32 is None else quant.scale_f32
    zp = 0 if quant is None else quant.zero_point
    return quantise_float32(value, scale, zp)

306

307

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

308

def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers (top, left, bottom, right, in that order)"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)

314

315

316

def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers

    A missing activation is treated as NONE_OR_RELU.  The activation min/max are quantised
    with the OFM quantization (or default to the OFM data type limits) and then clamped to
    both the int16 range and the OFM data type range.
    """
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())
    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        # Table lookup activations are encoded as 16 + table index
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = cast(int, activation_op_map[act.op_type])
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)

342

343

344

def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers

    Emits the first four entries of addresses with the first four commands of ptr_cmds.
    """
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

351

352

353

def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers

    tile_cmds holds the HEIGHT0, HEIGHT1 and WIDTH0 commands in that order; each register is
    written with the corresponding tile dimension minus one.
    """
    emit.cmd0_with_param(tile_cmds[0], tiles.height_0 - 1)
    emit.cmd0_with_param(tile_cmds[1], tiles.height_1 - 1)
    emit.cmd0_with_param(tile_cmds[2], tiles.width_0 - 1)

358

359

360

def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the strides that get_strides returns for fm"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

368

369

370

def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register

    The parameter is assembled as: bit 0 set for signed data types, activation precision
    starting at bit 2, bit 6 set for NHCWB16 layout, op_to_scale starting at bit 8.
    """
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)

382

383

384

def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register

    The parameter is assembled as: bit 0 set for signed data types, activation precision
    starting at bit 1, bit 6 set for NHCWB16 layout, bit 8 set for global scale, rounding
    mode starting at bit 14.
    """
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    activation_precision = precision_map[dtype.size_in_bits()]
    prec += activation_precision << 1

    # NOTE(review): the two conditions below are treated as independent (not nested);
    # confirm against the upstream indentation
    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

398

399

400

def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations

    Sets the reverse-operand flag when npu_op.reversed_operands is set.  If IFM2 is a scalar
    the scalar flag is set; otherwise a broadcast flag is set for every dimension in which
    the IFM and IFM2 shapes differ (IFM2 must then have size 1 in that dimension).
    """
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

427

428

429

def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, base addresses, tiles, depth, strides and zero point"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))

444

445

446

def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers

    Region, base addresses and tiles are only generated when IFM2 is not a scalar.
    """
    # NOTE(review): the extent of this conditional body is reconstructed (region, addresses
    # and tiles inside; strides and zero point outside) - confirm against upstream
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
    generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))

461

462

463

def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, base addresses, tiles, shape, strides and zero point"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))

480

481

482

def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers

    KERNEL_HEIGHT_M1/WIDTH_M1 are written with dilation * (size - 1).  KERNEL_STRIDE packs
    the x/y strides (minus one), the x/y dilations (minus one) and the block traversal flag.
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    # Named intermediates make the grouping explicit: subtraction happens before & and >>
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # set kernel x stride low bit
    stride = stride_x_m1 & 1
    # set kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

499

500

501

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers

    Does nothing when weights is empty.  Otherwise the region is taken from weights[0], and
    for each of the two cores the base/length registers are written from the matching
    weights entry; a core without an entry that is present in arch gets the first entry's
    address with length 0.
    """
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    for core, (addr, length) in enumerate(
        [
            (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
            (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
        ]
    ):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)

519

520

521

def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers

    Does nothing when biases is empty.  Otherwise the region is taken from biases[0], and
    for each of the two cores the base/length registers are written from the matching
    biases entry; a core without an entry that is present in arch gets the first entry's
    address with length 0.
    """
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set scale sources for active and present cores
    for core, (addr, length) in enumerate(
        [(cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH), (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH)]
    ):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)

536

537

538

def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers, each written as the dimension minus one"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

546

547

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

548

def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers

    Values come from arch_block_config.layout; IFM2_IB_START is only written for
    operations that have an IFM2.
    """
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, arch_block_config.layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, arch_block_config.layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, arch_block_config.layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

559

560

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

561

def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): this function has no implementation beyond its docstring, so it always
    # returns None regardless of its arguments; the return is spelled out to make that visible
    return None

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

568

569

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

570

def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation

    Derives the block type from the operation class, gathers the shapes and flags needed by
    try_block_config and asserts that npu_op.block_config fits.
    """
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    # Two LUT banks are requested when the activation is a table lookup
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    # "scaled" is only true when every feature map carries a quantization with a scale
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    if has_ifm2(npu_op):
        ifm2_shape = shape3d_to_block(npu_op.ifm2.shape)
    else:
        ifm2_shape = None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

619

620

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

621

def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """Emits KERNEL_WAIT and/or DMA_WAIT for every non-negative watermark in cmd_waits"""
    pending_waits = (
        (cmd0.NPU_OP_KERNEL_WAIT, cmd_waits.npu),
        (cmd0.NPU_OP_DMA_WAIT, cmd_waits.dma),
    )
    for wait_cmd, outstanding in pending_waits:
        # A negative watermark produces no wait command
        if outstanding >= 0:
            emit.cmd_wait(wait_cmd, 0, outstanding)

628

629

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

630

def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """
    Generate registers that are common to most operations.

    Emits, in order: IFM, IFM precision, IFM upscale, padding (if any), OFM, OFM precision,
    kernel (not for elementwise operations), weights, biases, activation, block config
    and SHRAM registers.

    :param emit: emitter that receives the generated commands
    :param npu_op: the block operation to generate registers for; must have both ifm and ofm
    :param block_traversal: traversal passed on to the kernel and block config helpers
    :param arch: architecture description passed on to the weight/bias/block config helpers
    :param use_global_scale: forwarded to generate_ofm_precision
    :param op_to_scale: forwarded to generate_ifm_precision
    """
    assert npu_op.ifm is not None and npu_op.ofm is not None
    generate_ifm(emit, npu_op.ifm)
    generate_ifm_precision(emit, npu_op.ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode_map[npu_op.ifm_upscale])
    if npu_op.padding is not None:
        generate_padding(emit, npu_op.padding)
    generate_ofm(emit, npu_op.ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)
    # Elementwise operations do not get kernel registers
    if npu_op.op_type != NpuOperationType.ElementWise:
        assert npu_op.kernel is not None
        generate_kernel(emit, npu_op.kernel, block_traversal)
    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)
    # The architecture block config is only needed for the SHRAM registers;
    # the block config register itself is generated from npu_op.block_config
    arch_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, arch_block_config)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

656

657

658

# -------------------------------------------------------------------

659

# SCALING

660

# -------------------------------------------------------------------

661

662

663

def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    The scale/shift pair is selected from, in priority order:
      1. a fused SIGMOID/TANH activation (rescaling to an output scale of 1/0x3000),
      2. a fused quantize (ifm scale / ofm scale),
      3. pool_op.rescale, either an ExplicitScaling or a plain numeric rescale factor,
      4. the ifm/ofm quantization scales, or scale=1, shift=0 if either scale is missing.

    :param emit: emitter that receives the NPU_SET_OFM_SCALE command
    :param pool_op: the pooling operation to generate the scaling for
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 1/4096 (shift == 0; tanh/sigmoid)
                # or 1/2048 (shift == 1; tanh only)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale until it exceeds half the int16 range (at most 31 times)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear/NearestNeighbor operations with rescale
            # Note: this is not used, but part of the public API
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

735

736

737

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Generates OFM/OPA/OPB_SCALE registers for elementwise operators.

    OFM_SCALE is always emitted; OPA_SCALE and OPB_SCALE are only emitted for ADD and SUB.

    :param emit: emitter that receives the generated commands
    :param npu_op: the elementwise operation to generate the scaling for
    :return: the operand to scale; 0 unless advanced add/sub scaling is used
    """
    op_to_scale = 0
    if npu_op.sub_op_type in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        # A missing quantization on any feature map results in a None scale
        input_scale = npu_op.ifm.quantization.scale_f32 if npu_op.ifm.quantization else None
        input2_scale = npu_op.ifm2.quantization.scale_f32 if npu_op.ifm2.quantization else None
        output_scale = npu_op.ofm.quantization.scale_f32 if npu_op.ofm.quantization else None

        if npu_op.activation is not None and npu_op.activation.op_type in (
            NpuActivationOp.SIGMOID,
            NpuActivationOp.TANH,
        ):
            # A fused SIGMOID/TANH overrides the OFM scale with a fixed value
            output_scale = 1 / 0x3000

        if npu_op.sub_op_type == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                ofm_scale = 1
                shift = 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
        else:  # Add/Sub
            # Default operand scaling is no scaling
            opa_scale = opb_scale = 1
            opa_shift = 0
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if npu_op.rescale is not None:
                # Explicit ofm scaling
                ofm_scale, shift = npu_op.rescale
            elif None in (input_scale, input2_scale, output_scale):
                # No ofm scaling
                ofm_scale = 1
                shift = 0
            elif input_scale == input2_scale and bitdepth == 16:
                # int16 same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                # align the double rounding with that of advanced scaling
                opa_scale //= 2
                opb_scale //= 2
                shift -= 1
                opa_shift = 0  # Unused for this case
            elif input_scale == input2_scale:
                # Same scaling
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                # the following we know that double rounding will have no effect for advanced scaling
                # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                use_advanced_scaling = int(ofm_scale) & 0xFFF != 0
            else:
                use_advanced_scaling = True
            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                (
                    opa_scale,
                    opa_shift,
                    ofm_scale,
                    shift,
                    op_to_scale,
                ) = scaling.advanced_elementwise_add_sub_scale(input_scale, input2_scale, output_scale, bitdepth)
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    if op_to_scale == scaling.OperandToScale.OPa:
                        op_to_scale = scaling.OperandToScale.OPb
                    else:
                        op_to_scale = scaling.OperandToScale.OPa
            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
    elif npu_op.sub_op_type in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        # NOTE(review): unlike the ADD/MUL/SUB branch, this dereferences ofm.quantization
        # without a None check - presumably it is always set for these ops; confirm
        output_scale = npu_op.ofm.quantization.scale_f32
        ofm_scale, shift = scaling.quantise_scale(output_scale)
    else:
        # All other elementwise operations use no OFM scaling
        ofm_scale = 1
        shift = 0
    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    return op_to_scale

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

828

# PRINT

829

# -------------------------------------------------------------------

Jacob Bohlin

e99b893

2020-07-13 16:01:51 +0200

[diff] [blame]

830

831

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

832

def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """Prints shape, quantization, strides, tiles and name of a feature map; does nothing if fm is None"""
    if fm is None:
        return
    if fm.quantization is None:
        quant_desc = "no quantization"
    else:
        quant_desc = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"
    height, width, depth = fm.shape
    size_bytes = height * width * depth * fm.data_type.size_in_bytes()
    print(f" {name}: h={height},w={width},c={depth}, region={fm.region}, {fm.layout}, {fm.data_type}, size={size_bytes}, {quant_desc}")
    fm_strides = get_strides(fm)
    stride_desc = f"Stride y/x/c: {fm_strides.height}/{fm_strides.width}/{fm_strides.depth}"
    tiles = fm.tiles
    base_addresses = list(map(hex, tiles.addresses))
    print(f" {stride_desc}, tiles: w0={tiles.width_0}, h0={tiles.height_0}, h1={tiles.height_1}, base={base_addresses}")
    print(f" name={fm.name}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

848

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

849

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

850

def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of a single NPU operation.

    :param npu_op: the operation to print
    :param index: index printed in front of the operation
    :param cmd: optional extra information appended to the header line, if truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    # Operations that are neither DMA nor block operations only get a header line
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    # From here on npu_op is handled as a block operation
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D whose kernel size, stride and dilation are all 1 is printed as FullyConnected
        # NOTE(review): k is dereferenced without a None check - presumably a Conv2D always has a kernel; confirm
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        # A scalar IFM2 is printed together with its quantized value
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # A NONE_OR_RELU activation without min/max is not printed
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # rescale is only printed for pooling and elementwise operations
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

897

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

898

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

899

def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """Prints every operation in the list, with the matching entry of npu_op_to_cmd (if any)"""
    if npu_op_to_cmd is None:
        npu_op_to_cmd = dict()
    for position, operation in enumerate(npu_op_list):
        print_operation(npu_op=operation, index=position, cmd=npu_op_to_cmd.get(operation))

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

903

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

904

905

# -------------------------------------------------------------------

906

# OPERATIONS

907

# -------------------------------------------------------------------

908

909

910

def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """Emits the NPU_OP_* command that matches the type of the given operation"""
    if isinstance(npu_op, NpuDmaOperation):
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        # The pooling sub operation is passed as command parameter
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        # The elementwise sub operation is passed as command parameter
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"

924

925

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

926

def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the register commands of a Conv2D operation, using the operation's own block traversal"""
    generate_common(emit=emit, npu_op=npu_op, block_traversal=npu_op.block_traversal, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

929

930

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

931

def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the register commands of a depthwise convolution, always using DEPTH_FIRST traversal"""
    generate_common(emit=emit, npu_op=npu_op, block_traversal=NpuBlockTraversal.DEPTH_FIRST, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

936

937

938

def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    :param emit: emitter that receives the generated commands
    :param npu_op: the pooling operation to generate registers for
    :param arch: architecture description, also used to validate REDUCE_SUM layouts
    :raises VelaError: for a REDUCE_SUM with a non-NHWC IFM that either has data type INT32
        or targets the Ethos_U65_512 accelerator config
    """
    # check that reduce_sum input is NHWC
    if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM and npu_op.ifm.layout != NpuLayout.NHWC:
        if npu_op.ifm.data_type == NpuDataType.INT32:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with IFM data type of INT32 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )
        elif arch.accelerator_config == Accelerator.Ethos_U65_512:
            raise VelaError(
                f"REDUCE_SUM ({npu_op.name}) with accelerator config of Ethos_U65_512 requires IFM layout to be NHWC"
                f" ({npu_op.ifm.name} == {npu_op.ifm.layout})"
            )

    # By default global scaling is used for average pool/reduce sum without any padding
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # (isinstance is False for None, so no separate None check is needed)
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

963

964

965

def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """Emits the register commands of an elementwise operation, including IFM2 for binary operations"""
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    # The scaling registers are generated before the common registers
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )
    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no IFM2 to generate
        return
    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    ifm2_is_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, ifm2_is_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not ifm2_is_scalar:
        return
    # The scalar is quantized with IFM2's quantization and must fit IFM2's data type
    scalar_value = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
    assert npu_op.ifm2.data_type.min_value() <= scalar_value <= npu_op.ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, scalar_value)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

990

991

992

def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """Emits the DMA0 source, destination and length registers of a DMA operation"""
    source = dma_op.src
    destination = dma_op.dest
    # Source region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)
    # Destination region and address
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)
    # The transfer length is taken from the source
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, source.length)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1000

1001

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

1002

def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Dispatches to the register generator that matches the type of the given operation.
    The final NPU_OP_... command is not generated here.
    """
    # Checked in this order; the first matching type wins
    block_op_generators = (
        (NpuConv2DOperation, generate_conv2d_op),
        (NpuConvDepthWiseOperation, generate_conv_depthwise_op),
        (NpuPoolingOperation, generate_pooling_op),
        (NpuElementWiseOperation, generate_elementwise_op),
    )
    for op_class, generate_op in block_op_generators:
        if isinstance(npu_op, op_class):
            generate_op(emit, npu_op, arch)
            return
    if isinstance(npu_op, NpuDmaOperation):
        # DMA register generation does not need the architecture
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1019

1020

1021

def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Generates register commands for the given list of NPU operations.
    Returns Ethos-U instructions, as a list of 32-bit integers.

    :param npu_op_list: the operations to generate commands for; DMA and block operations only
    :param arch: architecture description used for register generation and dependency calculation
    :param verbose: if True, prints the operations, the generated commands and the stream size
    :param mem_limits: memory limits that every operation's memory accesses are checked against
    :param add_to_debug_db: optional callback, called with (npu_op, emit.offset) after each operation
    :param npu_op_to_cmd: optional mapping that is only used for the verbose operation print
    :raises VelaError: if an operation fails its checks (the operation index and type are appended
        to the message), or if the command stream size reaches 16 MiB
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)
    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for npu_op in npu_op_list:
        if isinstance(npu_op, NpuDmaOperation):
            memory_accesses[npu_op] = get_dma_memory_accesses(npu_op)
        elif isinstance(npu_op, NpuBlockOperation):
            memory_accesses[npu_op] = get_op_memory_accesses(npu_op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)
    dep_watermark = Watermark(0, 0)
    # Previous block operation (DMA operations are not tracked); input to the BLOCKDEP calculation
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None
        if not isinstance(npu_op, NpuDmaOperation) and isinstance(npu_op, NpuBlockOperation):
            # Generate BLOCKDEP
            blockdep = calc_blockdep(arch, prev_op, npu_op)
            blockdep = min(blockdep, arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        # The waits are emitted after the operation's registers, right before the NPU_OP command
        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)
    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    # 1 << 24 bytes == 16 MiB
    if emit.size_in_bytes() >= 1 << 24:
        raise VelaError(
            f"The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {emit.size_in_bytes()/2**20:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1088

def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1089

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1090

Internal implementation of the public facing API for generating an Ethos-U register command stream.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1091

Calculates dependencies between commands and inserts wait operations if needed.

1092

1093

:param npu_op_list: List[NpuOperation] list of high level NPU operations

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

1094

:param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator

1095

:return Ethos-U instructions, as a list of 32-bit integers

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1096

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1097

accelerator = Accelerator.from_npu_accelerator(npu_accelerator)

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

1098

arch = create_default_arch(accelerator)

Louis Verhaard