Blame - ethosu/vela/register_command_stream_generator.py - ml/ethos-u/ethos-u-vela

2020-04-27 18:20:16 +0100

[diff] [blame]

2

#

3

# SPDX-License-Identifier: Apache-2.0

4

#

5

# Licensed under the Apache License, Version 2.0 (the License); you may

6

# not use this file except in compliance with the License.

7

# You may obtain a copy of the License at

8

#

9

# www.apache.org/licenses/LICENSE-2.0

10

#

11

# Unless required by applicable law or agreed to in writing, software

12

# distributed under the License is distributed on an AS IS BASIS, WITHOUT

13

# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.

14

# See the License for the specific language governing permissions and

15

# limitations under the License.

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

16

# Description:

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

17

# Register level (low-level) command stream generation for Ethos-U. Takes a list of NPU operations and generates

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

18

# all the register settings. Calculates dependencies between commands and inserts wait operations. And generates a bit

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

19

# stream suitable for interpretation by the Ethos-U processor.

Louis Verhaard

c629129

2021-03-19 09:35:48 +0100

[diff] [blame]

20

import math

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

21

from collections import defaultdict

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

22

from enum import Enum

23

from enum import IntEnum

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

24

from typing import cast

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

25

from typing import Dict

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

26

from typing import List

27

from typing import Optional

Diego Russo

ea6111a

2020-04-14 18:41:58 +0100

[diff] [blame]

import numpy as np

from . import scaling

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

32

from .api import NpuAccelerator

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

33

from .api import NpuActivation

34

from .api import NpuActivationOp

35

from .api import NpuAddressRange

36

from .api import NpuBlockOperation

37

from .api import NpuBlockTraversal

38

from .api import NpuConv2DOperation

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

39

from .api import NpuConvDepthWiseOperation

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

40

from .api import NpuDataType

41

from .api import NpuDmaOperation

42

from .api import NpuElementWiseOp

43

from .api import NpuElementWiseOperation

44

from .api import NpuFeatureMap

45

from .api import NpuKernel

46

from .api import NpuLayout

47

from .api import NpuOperation

48

from .api import NpuOperationType

49

from .api import NpuPadding

50

from .api import NpuPoolingOp

51

from .api import NpuPoolingOperation

52

from .api import NpuQuantization

53

from .api import NpuResamplingMode

54

from .api import NpuRoundingMode

55

from .api import NpuShape3D

56

from .api import NpuTileBox

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

57

from .architecture_allocator import ArchitectureBlockConfig

58

from .architecture_allocator import try_block_config

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

59

from .architecture_features import Accelerator

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

60

from .architecture_features import ArchitectureFeatures

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

61

from .architecture_features import create_default_arch

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

62

from .architecture_features import SHRAMElements

erik.andersson@arm.com

1878dab

2021-03-16 09:40:24 +0100

[diff] [blame]

63

from .errors import VelaError

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

64

from .ethos_u55_regs.ethos_u55_regs import acc_format

65

from .ethos_u55_regs.ethos_u55_regs import activation

66

from .ethos_u55_regs.ethos_u55_regs import cmd0

67

from .ethos_u55_regs.ethos_u55_regs import cmd1

68

from .ethos_u55_regs.ethos_u55_regs import elementwise_mode

Fredrik Svedberg

a0c3624

2020-06-03 15:43:31 +0200

[diff] [blame]

69

from .ethos_u55_regs.ethos_u55_regs import pooling_mode

Jacob Bohlin

cf7da10

2020-05-20 09:03:40 +0200

[diff] [blame]

70

from .ethos_u55_regs.ethos_u55_regs import resampling_mode

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

71

from .ethos_u55_regs.ethos_u55_regs import rounding

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

72

from .numeric_util import quantise_float32

73

from .numeric_util import round_away_zero

Diego Russo

2020-04-21 17:39:10 +0100

[diff] [blame]

74

from .numeric_util import round_up_to_int

Patrik Gustavsson

c74682c

2021-08-17 14:26:38 +0200

[diff] [blame]

75

from .operation import ExplicitScaling

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

76

from .operation import NpuBlockType

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

77

from .range_set import MemoryAccessSet

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

78

from .register_command_stream_util import BASE_PTR_INDEX_MEM2MEM

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

79

from .register_command_stream_util import calc_blockdep

80

from .register_command_stream_util import get_dma_memory_accesses

81

from .register_command_stream_util import get_op_memory_accesses

82

from .register_command_stream_util import get_strides

83

from .register_command_stream_util import get_wait_dependency

84

from .register_command_stream_util import has_ifm2

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

85

from .register_command_stream_util import shape3d_to_block

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

86

from .register_command_stream_util import to_kernel

87

from .register_command_stream_util import UNARY_ELEMWISE_OPS

88

from .register_command_stream_util import Watermark

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

89

90

91

class RegisterMachine:
    """Remembers the last value written to each register.

    The emitter uses this to detect register writes that would not change
    the register's value, so that they can be left out of the command stream.
    """

    def __init__(self):
        self.n_banks = 1
        # One register file per bank; a register that was never written reads as None
        self.registers = [defaultdict(lambda: None) for _ in range(self.n_banks)]
        self.bank_idx = 0

    def set_register(self, reg, value):
        """Record ``value`` for ``reg`` in the current bank.

        Returns True if the stored value differed from ``value`` (i.e. the
        write is not redundant), False otherwise.
        """
        bank = self.registers[self.bank_idx]
        is_changed = bank[reg] != value
        bank[reg] = value
        # NOTE: to force every command to be emitted (debugging), return True here
        return is_changed

    def switch_bank(self):
        """Advance to the next register bank, wrapping around after the last one."""
        self.bank_idx = (self.bank_idx + 1) % self.n_banks

105

106

107

class CmdMode(IntEnum):
    """Bit masks and payload-mode values for the low 16 bits of a command word."""

    NoPayload = 0x0000  # command consists of a single word
    Payload32 = 0x4000  # command is followed by one payload word
    Mask = 0xC000  # selects the payload-mode bits
    CmdOpMask = 0x03FF  # selects the command opcode bits

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

114

class CommandStreamEmitter:
    """Builds a register command stream as a list of command word tuples.

    Each entry of ``cmd_stream`` is a tuple holding the words of one command:
    ``(command,)`` for single-word commands or ``(command, payload)`` for
    commands that carry a payload word. Register writes are filtered through
    a RegisterMachine so that a write which repeats the register's current
    value is not emitted.
    """

    WORD_SIZE = 4  # size in bytes of one command stream word

    def __init__(self):
        self.cmd_stream = []
        # Index 0 tracks non-DMA registers, index 1 tracks DMA registers (see get_reg_machine)
        self.reg_machine = [RegisterMachine(), RegisterMachine()]
        self.last_absolute_wait = defaultdict(int)
        # Running size in bytes of all commands appended so far
        self.offset = 0

    def get_reg_machine(self, cmd):
        """Return the RegisterMachine for ``cmd``: commands with "DMA" in their name use a separate one."""
        if "DMA" in cmd.name:
            return self.reg_machine[1]
        return self.reg_machine[0]

    def size_in_bytes(self):
        """Return the total size in bytes of the commands in the stream."""
        return sum(len(cmd) for cmd in self.cmd_stream) * CommandStreamEmitter.WORD_SIZE

    def to_list(self) -> List[int]:
        """Return the command stream flattened into a list of words."""
        return [elem for cmd in self.cmd_stream for elem in cmd]

    def print_cmds(self):
        """Print a human readable table of the command stream, one command per row."""
        # "Offset" is right-aligned in 8 columns so that the header has the same
        # width as the "0x%06x:" offset printed on every data row
        s = f"{'Offset':>8}:"
        s += f" {'Payload':8}"
        s += f"{'Param':4}"  # no leading space for alignment
        s += f" {'Code':4}"
        s += f" - {'Command':30}"
        s += f" {'Param':5}"
        print(s)

        offset = 0
        for words_for_one_command in self.cmd_stream:
            code = words_for_one_command[0] & 0x0000FFFF  # lower 16 bits
            param = words_for_one_command[0] >> 16  # higher 16 bits

            payload_mode = CmdMode(code & CmdMode.Mask)
            has_payload = payload_mode != CmdMode.NoPayload

            s = f"0x{offset:06x}:"

            if has_payload:
                assert payload_mode == CmdMode.Payload32
                s += f" {words_for_one_command[1]:08x}"
            else:
                s += f" {'':8}"

            s += f" {param:04x}"
            s += f" {code:04x}"

            if has_payload:
                s += f" - {cmd1(code & CmdMode.CmdOpMask):30}"
                offset += CommandStreamEmitter.WORD_SIZE * 2
            else:
                s += f" - {cmd0(code & CmdMode.CmdOpMask):30}"
                offset += CommandStreamEmitter.WORD_SIZE

            s += f" {param:5}"
            print(s)

    def cmd0_with_param(self, cmd: cmd0, param):
        """Emit a single-word register write carrying a 16-bit parameter.

        ``param`` may be an Enum member (its value is used) or anything
        convertible with int(). Nothing is emitted if the register already
        holds the same value.
        """
        param = int(param.value if isinstance(param, Enum) else param) & 0xFFFF
        command = cmd.value | (param << 16)
        if not self.get_reg_machine(cmd).set_register(cmd, (command, param)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd1_with_offset(self, cmd: cmd1, offset, param=0x0):
        """Emit a two-word register write: command word (with 16-bit ``param``) plus a 32-bit payload.

        Nothing is emitted if the register already holds the same value.
        """
        offset = int(offset) & 0xFFFFFFFF
        param = int(param) & 0xFFFF
        command = cmd.value | CmdMode.Payload32.value | (param << 16)

        if not self.get_reg_machine(cmd).set_register(cmd, (command, offset)):
            return

        # This is not a redundant command, actually write it
        self.cmd_stream.append((command, offset))
        self.offset += CommandStreamEmitter.WORD_SIZE * 2

    def cmd1_with_address(self, cmd: cmd1, offset):
        """Emit an address register write: low 32 bits go in the payload, the bits above in the parameter."""
        self.cmd1_with_offset(cmd, offset, offset >> 32)

    def cmd_wait(self, cmd: cmd0, channel: int, outstanding_count: int):
        """Emit a wait command; the parameter is ``16 * channel + outstanding_count``.

        Wait commands are always emitted (they bypass the register machine).
        """
        param = (16 * channel) + outstanding_count
        command = ((param & 0xFFFF) << 16) | cmd.value
        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE

    def cmd_do_operation(self, cmd: cmd0, param=0):
        """Emit an operation command (always emitted) and switch the register bank afterwards."""
        param = int(param)
        command = ((param & 0xFFFF) << 16) | cmd.value

        self.cmd_stream.append((command,))
        self.offset += CommandStreamEmitter.WORD_SIZE
        self.get_reg_machine(cmd).switch_bank()

217

218

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

219

# -------------------------------------------------------------------

220

# REGISTER GENERATION

221

# -------------------------------------------------------------------

222

223

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

224

# TODO: Replace with definitions from ethos_u55_regs

225

class IFM2Broadcast(IntEnum):
    """Bit flags that are OR-ed together to form the IFM2_BROADCAST register value."""

    BroadcastHdim = 1 << 0
    BroadcastWdim = 1 << 1
    BroadcastCdim = 1 << 2
    ReverseOperandOrder = 1 << 6
    UseIFM2Scalar = 1 << 7

# Maps an API pooling operation to the corresponding pooling_mode register value
pooling_op_map = {
    NpuPoolingOp.MAX: pooling_mode.MAX.value,
    NpuPoolingOp.AVERAGE: pooling_mode.AVERAGE.value,
    NpuPoolingOp.REDUCE_SUM: pooling_mode.REDUCE_SUM.value,
}

# Maps an API elementwise operation to the corresponding elementwise_mode register value
elementwise_op_map = {
    NpuElementWiseOp.MUL: elementwise_mode.MUL.value,
    NpuElementWiseOp.ADD: elementwise_mode.ADD.value,
    NpuElementWiseOp.SUB: elementwise_mode.SUB.value,
    NpuElementWiseOp.MIN: elementwise_mode.MIN.value,
    NpuElementWiseOp.MAX: elementwise_mode.MAX.value,
    NpuElementWiseOp.LRELU: elementwise_mode.LRELU.value,
    NpuElementWiseOp.ABS: elementwise_mode.ABS.value,
    NpuElementWiseOp.CLZ: elementwise_mode.CLZ.value,
    NpuElementWiseOp.SHR: elementwise_mode.SHR.value,
    NpuElementWiseOp.SHL: elementwise_mode.SHL.value,
}

# Maps an API activation operation to the corresponding activation enum member.
# NpuActivationOp.TABLE_LOOKUP has no entry here; it is handled separately in generate_activation.
activation_op_map = {
    NpuActivationOp.NONE_OR_RELU: activation.NONE,
    NpuActivationOp.TANH: activation.TANH,
    NpuActivationOp.SIGMOID: activation.SIGMOID,
}

# Maps a SHRAMElements accumulator type to the corresponding acc_format value
acc_format_map = {
    SHRAMElements.Acc16: acc_format.FP_S5_10.value,
    SHRAMElements.Acc32: acc_format.INT_32BIT.value,
    SHRAMElements.Acc40: acc_format.INT_40BIT.value,
}

# Maps an API resampling mode to the corresponding resampling_mode enum member
resampling_mode_map = {
    NpuResamplingMode.NONE: resampling_mode.NONE,
    NpuResamplingMode.NEAREST: resampling_mode.NEAREST,
    NpuResamplingMode.TRANSPOSE: resampling_mode.TRANSPOSE,
}

# Maps data type size in bits to activation precision
precision_map = {8: 0, 16: 1, 32: 2}

# Maps rounding mode to the corresponding value
rounding_mode_map = {
    NpuRoundingMode.TFL: rounding.TFL.value,
    NpuRoundingMode.TRUNCATE: rounding.TRUNCATE.value,
    NpuRoundingMode.NATURAL: rounding.NATURAL.value,
}

Louis Verhaard

2021-03-17 14:26:34 +0100

[diff] [blame]

282

def check_mem_limits(memory_accesses: MemoryAccessSet, mem_limits: Dict[int, int]):
    """Checks that an operation's memory accesses respect the boundaries imposed by mem_limits

    :param memory_accesses: the accesses to check; each access holds address ranges per region
    :param mem_limits: maps region -> highest allowed address offset for that region
    :raises VelaError: if a region has no entry in mem_limits, or if the start or end
        of any range is negative or larger than the region's limit
    """
    for mem_access in memory_accesses.accesses:
        for region, range_set in mem_access.regions.items():
            if region not in mem_limits:
                raise VelaError(f"Invalid region: {region}")
            # Named "limit" rather than "max" to avoid shadowing the builtin
            limit = mem_limits[region]
            for start, end in range_set.ranges:
                for offset in (start, end):
                    if offset < 0:
                        raise VelaError(f"Negative address offset: {offset}, region: {region}")
                    if offset > limit:
                        raise VelaError(f"Address offset out of range: {offset}, region: {region}, max: {limit}")

295

296

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

297

def quantise(value: float, quant: Optional[NpuQuantization]) -> int:
    """Quantizes the given value

    A scale of 1 is used when quant is None or has no scale_f32; a zero point of 0
    is used when quant is None.
    """
    if quant is None:
        scale, zp = 1, 0
    else:
        scale = 1 if quant.scale_f32 is None else quant.scale_f32
        zp = quant.zero_point
    return quantise_float32(value, scale, zp)

302

303

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

304

def generate_padding(emit: CommandStreamEmitter, padding: NpuPadding):
    """Generates IFM_PAD registers (top, left, bottom, right, in that order)"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_TOP, padding.top)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_LEFT, padding.left)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_BOTTOM, padding.bottom)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_PAD_RIGHT, padding.right)

310

311

312

def generate_activation(emit: CommandStreamEmitter, activation: Optional[NpuActivation], ofm: NpuFeatureMap):
    """Generates ACTIVATION registers

    :param emit: emitter that receives the register writes
    :param activation: the activation to apply; None is treated as NONE_OR_RELU
    :param ofm: output feature map; its data type and quantization determine the clamp range
    """
    act = activation if activation is not None else NpuActivation(NpuActivationOp.NONE_OR_RELU)

    # An unspecified bound defaults to the corresponding limit of the OFM data type
    if act.min is None:
        quantized_min = ofm.data_type.min_value()
    else:
        quantized_min = quantise(act.min, ofm.quantization)
    if act.max is None:
        quantized_max = ofm.data_type.max_value()
    else:
        quantized_max = quantise(act.max, ofm.quantization)

    # Clamp the bounds to both the int16 range and the range of the OFM data type
    quantized_min = max(quantized_min, np.iinfo(np.int16).min, ofm.data_type.min_value())
    quantized_max = min(quantized_max, np.iinfo(np.int16).max, ofm.data_type.max_value())

    if act.op_type == NpuActivationOp.TABLE_LOOKUP:
        assert 0 <= act.lookup_table_index < 8
        # Table lookup activations are encoded as 16 + the index of the lookup table
        activation_value = 16 + act.lookup_table_index
        if ofm.data_type == NpuDataType.INT32:
            activation_value |= 3 << 12  # Force I8 range
            quantized_min = max(-128, quantized_min)
            quantized_max = min(127, quantized_max)
    else:
        activation_value = cast(int, activation_op_map[act.op_type])

    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION, activation_value)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MIN, quantized_min)
    emit.cmd0_with_param(cmd0.NPU_SET_ACTIVATION_MAX, quantized_max)

338

339

340

def generate_addresses(emit: CommandStreamEmitter, ptr_cmds: List[cmd1], addresses: List[int], layout: NpuLayout):
    """Generates xFM_BASE registers

    :param ptr_cmds: the four base pointer commands (BASE0..BASE3)
    :param addresses: the four tile addresses, in the same order as ptr_cmds
    :param layout: feature map layout; NHCWB16 requires 16-byte aligned addresses
    """
    if layout == NpuLayout.NHCWB16:
        # Check that all BasePointer addresses are aligned to 16 bytes
        assert all((int(addr) % 16) == 0 for addr in addresses)
    # Exactly four base registers are written; indexing (rather than zip) keeps the
    # IndexError if fewer than four commands or addresses are supplied
    for i in range(4):
        emit.cmd1_with_address(ptr_cmds[i], addresses[i])

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

347

348

349

def generate_tiles(emit: CommandStreamEmitter, tile_cmds: List[cmd0], tiles: NpuTileBox):
    """Generates xFM_HEIGHT0/HEIGHT1/WIDTH0 registers

    tile_cmds holds the HEIGHT0, HEIGHT1 and WIDTH0 commands, in that order.
    Each register is written with the corresponding tile dimension minus one.
    """
    height0_cmd, height1_cmd, width0_cmd = tile_cmds[0], tile_cmds[1], tile_cmds[2]
    emit.cmd0_with_param(height0_cmd, tiles.height_0 - 1)
    emit.cmd0_with_param(height1_cmd, tiles.height_1 - 1)
    emit.cmd0_with_param(width0_cmd, tiles.width_0 - 1)

354

355

356

def generate_strides(
    emit: CommandStreamEmitter, fm: NpuFeatureMap, stride_c_cmd: cmd1, stride_y_cmd: cmd1, stride_x_cmd: cmd1
):
    """Generates STRIDE_C/Y/X registers from the strides that get_strides returns for fm"""
    strides = get_strides(fm)
    emit.cmd1_with_address(stride_c_cmd, strides.depth)  # stride between 16-byte channel blocks (C)
    emit.cmd1_with_address(stride_y_cmd, strides.height)  # stride between vertical values (H)
    emit.cmd1_with_address(stride_x_cmd, strides.width)  # stride between horizontal values (W)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

364

365

366

def generate_ifm_precision(emit: CommandStreamEmitter, fm: NpuFeatureMap, op_to_scale: int, precision_cmd: cmd0):
    """Generates IFM/IFM2_PRECISION register

    The register value is assembled as:
      bit 0:     1 if the data type is signed
      bits 2..:  activation precision (see precision_map)
      bit 6:     1 if the layout is NHCWB16
      bits 8..:  op_to_scale
    """
    dtype = fm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 2

    if fm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6

    prec |= op_to_scale << 8
    emit.cmd0_with_param(precision_cmd, prec)

378

379

380

def generate_ofm_precision(emit: CommandStreamEmitter, npu_op: NpuBlockOperation, use_global_scale: bool):
    """Generates OFM_PRECISION register

    The register value is assembled as:
      bit 0:      1 if the OFM data type is signed
      bits 1..:   activation precision (see precision_map)
      bit 6:      1 if the OFM layout is NHCWB16
      bit 8:      1 if global scale is used
      bits 14..:  rounding mode (see rounding_mode_map)
    """
    dtype = npu_op.ofm.data_type
    prec = 1 if dtype.is_signed() else 0
    prec |= precision_map[dtype.size_in_bits()] << 1

    if use_global_scale:
        # Set global scale bit, as opposed to using per channel scale
        prec |= 1 << 8
    if npu_op.ofm.layout == NpuLayout.NHCWB16:
        prec |= 1 << 6
    prec |= rounding_mode_map[npu_op.rounding_mode] << 14
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_PRECISION, prec)

394

395

396

def generate_ifm2_broadcast(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation):
    """Generates IFM2_BROADCAST register for binary elementwise operations

    When IFM2 is a scalar only the scalar flag is set. Otherwise a broadcast flag is
    set for every dimension in which the IFM and IFM2 shapes differ; IFM2 must have
    size 1 in such a dimension.
    """
    ifm2_broadcast = 0
    ifm = npu_op.ifm
    ifm2 = npu_op.ifm2
    if npu_op.reversed_operands:
        ifm2_broadcast |= IFM2Broadcast.ReverseOperandOrder
    if npu_op.ifm2_scalar is not None:
        # IFM2 is a constant, set UseIFM2Scalar bit to IFM2_BROADCAST
        ifm2_broadcast |= IFM2Broadcast.UseIFM2Scalar
    else:
        if ifm.shape.height != ifm2.shape.height:
            # Broadcast in 'H' dimension
            assert ifm2.shape.height == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastHdim

        if ifm.shape.width != ifm2.shape.width:
            # Broadcast in 'W' dimension
            assert ifm2.shape.width == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastWdim

        if ifm.shape.depth != ifm2.shape.depth:
            # Broadcast in 'C' dimension
            assert ifm2.shape.depth == 1
            ifm2_broadcast |= IFM2Broadcast.BroadcastCdim

    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_BROADCAST, ifm2_broadcast)

423

424

425

def generate_ifm(emit: CommandStreamEmitter, ifm: NpuFeatureMap):
    """Generates general IFM registers: region, base addresses, tile sizes, depth, strides and zero point"""
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_REGION, ifm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_IFM_BASE0, cmd1.NPU_SET_IFM_BASE1, cmd1.NPU_SET_IFM_BASE2, cmd1.NPU_SET_IFM_BASE3],
        ifm.tiles.addresses,
        ifm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_IFM_HEIGHT0_M1, cmd0.NPU_SET_IFM_HEIGHT1_M1, cmd0.NPU_SET_IFM_WIDTH0_M1], ifm.tiles
    )
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_DEPTH_M1, ifm.shape.depth - 1)
    generate_strides(emit, ifm, cmd1.NPU_SET_IFM_STRIDE_C, cmd1.NPU_SET_IFM_STRIDE_Y, cmd1.NPU_SET_IFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_ZERO_POINT, int(ifm.quantization.zero_point))

440

441

442

def generate_ifm2(emit: CommandStreamEmitter, ifm2: NpuFeatureMap, has_scalar: bool):
    """Generates general IFM2 registers

    The region, base address, tile and stride registers are only generated when IFM2
    is not a scalar; the zero point register is always generated.
    """
    if not has_scalar:
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_REGION, ifm2.region)
        generate_addresses(
            emit,
            [cmd1.NPU_SET_IFM2_BASE0, cmd1.NPU_SET_IFM2_BASE1, cmd1.NPU_SET_IFM2_BASE2, cmd1.NPU_SET_IFM2_BASE3],
            ifm2.tiles.addresses,
            ifm2.layout,
        )
        generate_tiles(
            emit, [cmd0.NPU_SET_IFM2_HEIGHT0_M1, cmd0.NPU_SET_IFM2_HEIGHT1_M1, cmd0.NPU_SET_IFM2_WIDTH0_M1], ifm2.tiles
        )
        generate_strides(emit, ifm2, cmd1.NPU_SET_IFM2_STRIDE_C, cmd1.NPU_SET_IFM2_STRIDE_Y, cmd1.NPU_SET_IFM2_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_ZERO_POINT, int(ifm2.quantization.zero_point))

457

458

459

def generate_ofm(emit: CommandStreamEmitter, ofm: NpuFeatureMap):
    """Generates general OFM registers: region, base addresses, tile sizes, shape, strides and zero point"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_REGION, ofm.region)
    generate_addresses(
        emit,
        [cmd1.NPU_SET_OFM_BASE0, cmd1.NPU_SET_OFM_BASE1, cmd1.NPU_SET_OFM_BASE2, cmd1.NPU_SET_OFM_BASE3],
        ofm.tiles.addresses,
        ofm.layout,
    )
    generate_tiles(
        emit, [cmd0.NPU_SET_OFM_HEIGHT0_M1, cmd0.NPU_SET_OFM_HEIGHT1_M1, cmd0.NPU_SET_OFM_WIDTH0_M1], ofm.tiles
    )
    # The shape registers hold the dimension minus one
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_HEIGHT_M1, ofm.shape.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_WIDTH_M1, ofm.shape.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_DEPTH_M1, ofm.shape.depth - 1)
    generate_strides(emit, ofm, cmd1.NPU_SET_OFM_STRIDE_C, cmd1.NPU_SET_OFM_STRIDE_Y, cmd1.NPU_SET_OFM_STRIDE_X)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_ZERO_POINT, int(ofm.quantization.zero_point))

476

477

478

def generate_kernel(emit: CommandStreamEmitter, kernel: NpuKernel, block_traversal: NpuBlockTraversal):
    """Generates KERNEL related registers

    KERNEL_HEIGHT_M1/WIDTH_M1 are written with dilation * (size - 1).
    The KERNEL_STRIDE value is assembled as:
      bit 0:     low bit of (stride_x - 1)
      bit 1:     low bit of (stride_y - 1)
      bit 2:     1 if block traversal is PART_KERNEL_FIRST
      bit 3..:   dilation_x - 1
      bit 4..:   dilation_y - 1
      bits 6..:  remaining (upper) bits of (stride_x - 1)
      bits 9..:  remaining (upper) bits of (stride_y - 1)
    """
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_HEIGHT_M1, kernel.dilation_y * (kernel.height - 1))
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_WIDTH_M1, kernel.dilation_x * (kernel.width - 1))
    stride_x_m1 = kernel.stride_x - 1
    stride_y_m1 = kernel.stride_y - 1
    # set kernel x stride low bit
    stride = stride_x_m1 & 1
    # set kernel y stride low bit
    stride |= (stride_y_m1 & 1) << 1
    # set kernel x stride extension bits
    stride |= (stride_x_m1 >> 1) << 6
    # set kernel y stride extension bits
    stride |= (stride_y_m1 >> 1) << 9
    stride |= (kernel.dilation_x - 1) << 3
    stride |= (kernel.dilation_y - 1) << 4
    if block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST:
        stride |= 1 << 2
    emit.cmd0_with_param(cmd0.NPU_SET_KERNEL_STRIDE, stride)

495

496

497

def generate_weights(emit: CommandStreamEmitter, weights: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates WEIGHT registers

    Does nothing if there are no weights. The region is taken from the first address
    range. For each of the two weight streams: if an address range exists for that core
    its address and length are used; otherwise, if the core is present (core < arch.ncores),
    the first range's address is used with length 0.
    """
    if not weights:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_WEIGHT_REGION, weights[0].region)
    # Set weights sources for active and present cores
    weight_cmds = [
        (cmd1.NPU_SET_WEIGHT_BASE, cmd1.NPU_SET_WEIGHT_LENGTH),
        (cmd1.NPU_SET_WEIGHT1_BASE, cmd1.NPU_SET_WEIGHT1_LENGTH),
    ]
    for core, (addr, length) in enumerate(weight_cmds):
        if core < len(weights):
            emit.cmd1_with_address(addr, weights[core].address)
            emit.cmd1_with_offset(length, weights[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, weights[0].address)
            emit.cmd1_with_offset(length, 0)

515

516

517

def generate_biases(emit: CommandStreamEmitter, biases: List[NpuAddressRange], arch: ArchitectureFeatures):
    """Generates SCALE registers

    Does nothing if there are no biases. The region is taken from the first address
    range. For each of the two scale streams: if an address range exists for that core
    its address and length are used; otherwise, if the core is present (core < arch.ncores),
    the first range's address is used with length 0.
    """
    if not biases:
        return
    emit.cmd0_with_param(cmd0.NPU_SET_SCALE_REGION, biases[0].region)
    # Set scale (bias) sources for active and present cores
    scale_cmds = [
        (cmd1.NPU_SET_SCALE_BASE, cmd1.NPU_SET_SCALE_LENGTH),
        (cmd1.NPU_SET_SCALE1_BASE, cmd1.NPU_SET_SCALE1_LENGTH),
    ]
    for core, (addr, length) in enumerate(scale_cmds):
        if core < len(biases):
            emit.cmd1_with_address(addr, biases[core].address)
            emit.cmd1_with_offset(length, biases[core].length)
        elif core < arch.ncores:
            emit.cmd1_with_address(addr, biases[0].address)
            emit.cmd1_with_offset(length, 0)

532

533

534

def generate_block_config(
    emit: CommandStreamEmitter,
    block_config: NpuShape3D,
):
    """Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers (each written with the dimension minus one)"""
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_HEIGHT_M1, block_config.height - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_WIDTH_M1, block_config.width - 1)
    emit.cmd0_with_param(cmd0.NPU_SET_OFM_BLK_DEPTH_M1, block_config.depth - 1)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

542

543

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

544

def generate_shram_registers(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    arch_block_config: ArchitectureBlockConfig,
):
    """Generates IB_END/IB_START/AB_START/ACC_FORMAT registers

    IFM2_IB_START is only generated for operations for which has_ifm2 is true.
    """
    shram_layout = arch_block_config.layout
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_IB_END, shram_layout.ib_end)
    emit.cmd0_with_param(cmd0.NPU_SET_AB_START, shram_layout.ab_start)
    if has_ifm2(npu_op):
        emit.cmd0_with_param(cmd0.NPU_SET_IFM2_IB_START, shram_layout.ib_start2)
    emit.cmd0_with_param(cmd0.NPU_SET_ACC_FORMAT, acc_format_map[arch_block_config.acc_type])

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

555

556

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

557

def get_block_config_for_npu_op(
    arch, npu_op: NpuBlockOperation, npu_block_type: NpuBlockType, is_partkernel: bool, ifm_resampling: resampling_mode
) -> Optional[ArchitectureBlockConfig]:
    """
    Given npu_op.block_config, returns a corresponding ArchitectureBlockConfig.
    Returns None if the block_config does not fit.
    """
    # NOTE(review): this function has no body besides its docstring, so it always
    # returns None regardless of its arguments - confirm whether it is still needed
    # or whether an implementation is missing.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

564

565

Tim Hall

2021-05-27 18:49:40 +0100

[diff] [blame]

566

def get_arch_block_config(
    npu_op: NpuBlockOperation, block_traversal: NpuBlockTraversal, arch: ArchitectureFeatures
) -> ArchitectureBlockConfig:
    """Creates shared buffer allocation for the given operation

    Derives the block type and the other inputs of try_block_config from npu_op and
    returns the resulting ArchitectureBlockConfig. Asserts that npu_op.block_config is
    set, that the operation type is supported, and that the block config fits.
    """
    assert npu_op.block_config is not None, "block_config has not been set"
    block_type = NpuBlockType.Default
    if isinstance(npu_op, NpuConv2DOperation):
        block_type = NpuBlockType.ConvolutionMxN
    elif isinstance(npu_op, NpuConvDepthWiseOperation):
        block_type = NpuBlockType.ConvolutionDepthWise
    elif isinstance(npu_op, NpuPoolingOperation):
        block_type = NpuBlockType.ReduceSum if npu_op.sub_op_type == NpuPoolingOp.REDUCE_SUM else NpuBlockType.Pooling
    elif isinstance(npu_op, NpuElementWiseOperation):
        block_type = NpuBlockType.ElementWise
    else:
        assert 0, "Unsupported operation"
    ifm_resampling_mode = resampling_mode_map[npu_op.ifm_upscale]
    is_partkernel = block_traversal == NpuBlockTraversal.PART_KERNEL_FIRST
    # Two LUT banks are requested when the activation is a table lookup
    uses_lut = npu_op.activation is not None and npu_op.activation.op_type == NpuActivationOp.TABLE_LOOKUP
    lut_banks = 2 if uses_lut else 0
    # The operation counts as scaled only if every feature map has a quantization with a scale
    fms = [npu_op.ifm, npu_op.ofm]
    if npu_op.ifm2 is not None:
        fms.append(npu_op.ifm2)
    all_fms_have_quant = not any(fm.quantization is None or fm.quantization.scale_f32 is None for fm in fms)
    ifm_bits = npu_op.ifm.data_type.size_in_bits()
    ifm_shape = shape3d_to_block(npu_op.ifm.shape)
    ifm2_shape = shape3d_to_block(npu_op.ifm2.shape) if has_ifm2(npu_op) else None
    uses_scalar = npu_op.ifm2_scalar is not None
    block_config = shape3d_to_block(npu_op.block_config)
    arch_block_config = try_block_config(
        block_config,
        arch,
        block_type,
        shape3d_to_block(npu_op.ofm.shape),
        ifm_shape,
        ifm2_shape,
        uses_scalar,
        ifm_bits,
        is_partkernel=is_partkernel,
        kernel=to_kernel(npu_op.kernel),
        lut_banks=lut_banks,
        scaled=all_fms_have_quant,
        ifm_resampling=ifm_resampling_mode,
    )
    assert arch_block_config is not None, f"block_config {npu_op.block_config} does not fit"
    return arch_block_config

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

615

616

Louis Verhaard

2020-11-26 11:42:04 +0100

[diff] [blame]

617

def generate_cmd_waits(emit: CommandStreamEmitter, cmd_waits: Watermark):
    """
    Emits the KERNEL_WAIT and/or DMA_WAIT commands described by cmd_waits.

    :param emit: command stream emitter that receives the wait commands
    :param cmd_waits: watermark whose npu/dma fields hold the wait parameters;
                      a negative field means that no wait is emitted for that unit
    """
    npu_wait = cmd_waits.npu
    if npu_wait >= 0:
        emit.cmd_wait(cmd0.NPU_OP_KERNEL_WAIT, 0, npu_wait)

    dma_wait = cmd_waits.dma
    if dma_wait >= 0:
        emit.cmd_wait(cmd0.NPU_OP_DMA_WAIT, 0, dma_wait)

624

625

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

626

def generate_common(
    emit: CommandStreamEmitter,
    npu_op: NpuBlockOperation,
    block_traversal: NpuBlockTraversal,
    arch: ArchitectureFeatures,
    use_global_scale: bool = False,
    op_to_scale: int = 0,
):
    """
    Emits the register settings that most block operations have in common.

    The registers are generated in a fixed order: IFM, IFM precision, IFM upscale, padding (if any),
    OFM, OFM precision, kernel (all but elementwise), weights, biases, activation, block config, SHRAM.

    :param emit: command stream emitter that receives the register commands
    :param npu_op: the block operation; its ifm and ofm must be set
    :param block_traversal: block traversal, used for the kernel registers and the block config lookup
    :param arch: architecture features of the target accelerator
    :param use_global_scale: forwarded to generate_ofm_precision
    :param op_to_scale: forwarded to generate_ifm_precision
    """
    ifm = npu_op.ifm
    assert ifm is not None
    ofm = npu_op.ofm
    assert ofm is not None

    # Input feature map
    generate_ifm(emit, ifm)
    generate_ifm_precision(emit, ifm, op_to_scale, cmd0.NPU_SET_IFM_PRECISION)
    upscale_mode = resampling_mode_map[npu_op.ifm_upscale]
    emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, upscale_mode)
    padding = npu_op.padding
    if padding is not None:
        generate_padding(emit, padding)

    # Output feature map
    generate_ofm(emit, ofm)
    generate_ofm_precision(emit, npu_op, use_global_scale)

    # Elementwise operations do not get kernel registers
    if npu_op.op_type != NpuOperationType.ElementWise:
        kernel = npu_op.kernel
        assert kernel is not None
        generate_kernel(emit, kernel, block_traversal)

    generate_weights(emit, npu_op.weights, arch)
    generate_biases(emit, npu_op.biases, arch)
    generate_activation(emit, npu_op.activation, npu_op.ofm)

    # The architecture block config is looked up before the block config registers are emitted
    shram_block_config = get_arch_block_config(npu_op, block_traversal, arch)
    generate_block_config(emit, npu_op.block_config)
    generate_shram_registers(emit, npu_op, shram_block_config)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

652

653

654

# -------------------------------------------------------------------

655

# SCALING

656

# -------------------------------------------------------------------

657

658

659

def generate_ofm_scaling_for_pooling(emit: CommandStreamEmitter, pool_op: NpuPoolingOperation):
    """
    Generates OFM_SCALE register for pooling operations.

    The scale/shift pair is selected as follows:
    - fused SIGMOID/TANH activation: derived from 0x3000 * IFM scale (with a special case for int16 IFM)
    - fused quantize: quantised IFM scale / OFM scale
    - rescale set: either the explicit multiplier/shift or a rescale factor applied to the pooling scale
    - otherwise: IFM scale / OFM scale applied to the pooling scale, or scale 1, shift 0 when a scale is missing

    :param emit: command stream emitter that receives NPU_SET_OFM_SCALE
    :param pool_op: the pooling operation to generate scaling for
    """
    # For valid padding vela has to output scaling values
    kernel = pool_op.kernel
    ifm_quant = pool_op.ifm.quantization
    ofm_quant = pool_op.ofm.quantization
    if pool_op.activation is not None and pool_op.activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
        assert ifm_quant.scale_f32 is not None
        rescale = 0x3000 * ifm_quant.scale_f32
        if pool_op.ifm.data_type == NpuDataType.INT16:
            # Calculate scale and shift for the output scale of 1/(3*4096)
            x_log2 = math.log2(ifm_quant.scale_f32)
            rounded_log2 = int(round(x_log2))
            is_power_of_two = abs(x_log2 - rounded_log2) < 0.001
            # shift == 0 corresponds to an input scale of 2**-12, shift == 1 to 2**-11
            shift = rounded_log2 + 12
            if is_power_of_two and (
                (pool_op.activation.op_type == NpuActivationOp.TANH and shift in (0, 1))
                or (pool_op.activation.op_type == NpuActivationOp.SIGMOID and shift == 0)
            ):
                # Special handling if input scale is 2**-12 (tanh/sigmoid) or 2**-11 (tanh only)
                scale = 3 << shift
                shift = 0
            else:
                # Double rescale until it no longer fits in half the int16 range (at most 31 times)
                shift = 0
                max_rescale = np.iinfo(np.int16).max / 2
                while rescale <= max_rescale and shift <= 30:
                    shift += 1
                    rescale *= 2
                scale = int(rescale)
        else:
            # Number of bits of the rounded up rescale factor (bin() prefix "0b" removed), plus one
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    elif pool_op.fused_quantize:
        # Quantize op requires different scaling
        ifm_scale_f64 = np.double(ifm_quant.scale_f32)
        ofm_scale_f64 = np.double(ofm_quant.scale_f32)
        scale, shift = scaling.quantise_scale(ifm_scale_f64 / ofm_scale_f64)
    elif pool_op.rescale is not None:
        if isinstance(pool_op.rescale, ExplicitScaling):
            # Note: reuse of rescale for explicit scaling to not expose this in the external API
            explicit_scaling = pool_op.rescale
            assert explicit_scaling.per_channel is False
            scale = explicit_scaling.multiplier[0]
            shift = explicit_scaling.shift[0]
        else:
            # for ResizeBilinear operations with rescale
            rescale = pool_op.rescale
            rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
    else:
        # In case avg pool fused with concat or other memory operation, rescaling might be needed.
        # kernel height == kernel width == 1 is always true in this case
        # Normally the scale is maximised, to get maximum precision, which means that
        # if rescale != 1, scale need to consider the number of bits needed for rescaling
        if ofm_quant.scale_f32 is not None and ifm_quant.scale_f32 is not None:
            rescale = ifm_quant.scale_f32 / ofm_quant.scale_f32
            rescale_bits = 0
            if kernel.height == kernel.width == 1:
                if rescale > 1:
                    rescale_bits = len(bin(round_up_to_int(rescale))) - 2 + 1
                elif rescale < 1:
                    rescale_bits = -(len(bin(round_up_to_int(1 / rescale))) - 2 - 1)
            scale, shift = scaling.quantise_pooling_scale(kernel.height * kernel.width, rescale_bits)
            scale = int(round_away_zero(scale * rescale))
        else:
            # No quantization scales available: use an identity scale
            scale = 1
            shift = 0

    emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, scale, shift)

730

731

732

def generate_scaling_for_elementwise(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation) -> int:
    """
    Emits the OFM/OPA/OPB_SCALE registers required by an elementwise operation.

    ADD/SUB emit OPA, OPB and OFM scale; MUL, LRELU and ABS emit OFM scale only;
    all other elementwise operations get an OFM scale of 1 with shift 0.

    Returns the operand to scale; this is 0 unless advanced add/sub scaling selected an operand
    """
    op_to_scale = 0
    sub_op = npu_op.sub_op_type
    if sub_op in (NpuElementWiseOp.ADD, NpuElementWiseOp.MUL, NpuElementWiseOp.SUB):
        ifm_quant = npu_op.ifm.quantization
        input_scale = ifm_quant.scale_f32 if ifm_quant else None
        ifm2_quant = npu_op.ifm2.quantization
        input2_scale = ifm2_quant.scale_f32 if ifm2_quant else None
        ofm_quant = npu_op.ofm.quantization
        output_scale = ofm_quant.scale_f32 if ofm_quant else None

        activation = npu_op.activation
        if activation is not None and activation.op_type in (NpuActivationOp.SIGMOID, NpuActivationOp.TANH):
            # A fused sigmoid/tanh replaces the output scale by a fixed value
            output_scale = 1 / 0x3000

        scales_missing = None in (input_scale, input2_scale, output_scale)

        if sub_op == NpuElementWiseOp.MUL:
            if npu_op.rescale:
                ofm_scale, shift = npu_op.rescale
            elif scales_missing:
                ofm_scale, shift = 1, 0
            else:
                ofm_scale, shift = scaling.elementwise_mul_scale(input_scale, input2_scale, output_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
        else:
            # Add/Sub
            opa_scale: float
            opb_scale: float
            bitdepth = npu_op.ifm.data_type.size_in_bits()
            use_advanced_scaling = False
            if scales_missing:
                opa_scale, opb_scale, opa_shift = 1, 1, 0
                # An explicit rescale is only honoured when a quantization scale is missing
                ofm_scale, shift = (1, 0) if npu_op.rescale is None else npu_op.rescale
            elif input_scale == input2_scale:
                opa_scale, opb_scale, ofm_scale, shift = scaling.simplified_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale
                )
                opa_shift = 0  # Unused for this case
                if bitdepth == 16:
                    # align the double rounding with that of advanced scaling
                    opa_scale /= 2
                    opb_scale /= 2
                    shift -= 1
                else:
                    # For 8 bit we can't guarantee double rounding with simplified scaling will always be
                    # the same as with advanced scaling due to different shifts. When the ofm scale fulfils
                    # the following we know that double rounding will have no effect for advanced scaling
                    # no matter the input, so we can safely use simplified scaling with double rounding disabled.
                    use_advanced_scaling = (int(ofm_scale) & 0xFFF) != 0
            else:
                use_advanced_scaling = True

            if use_advanced_scaling:
                # Use advanced implementation only when input/output scales differ,
                # or when we can't guarantee the absence of rounding errors
                advanced = scaling.advanced_elementwise_add_sub_scale(
                    input_scale, input2_scale, output_scale, bitdepth
                )
                opa_scale, opa_shift, ofm_scale, shift, op_to_scale = advanced
                opb_scale = 0  # Unused for this case
                if npu_op.reversed_operands:
                    # If the operand order is reversed we also have to swap which operand is scaled
                    op_to_scale = (
                        scaling.OperandToScale.OPb
                        if op_to_scale == scaling.OperandToScale.OPa
                        else scaling.OperandToScale.OPa
                    )

            emit.cmd1_with_offset(cmd1.NPU_SET_OPA_SCALE, opa_scale, opa_shift)
            emit.cmd1_with_offset(cmd1.NPU_SET_OPB_SCALE, opb_scale)
            emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    elif sub_op in (NpuElementWiseOp.LRELU, NpuElementWiseOp.ABS):
        ofm_scale, shift = scaling.quantise_scale(npu_op.ofm.quantization.scale_f32)
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, ofm_scale, shift)
    else:
        emit.cmd1_with_offset(cmd1.NPU_SET_OFM_SCALE, 1, 0)
    return op_to_scale

# -------------------------------------------------------------------

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

819

# PRINT

820

# -------------------------------------------------------------------

Jacob Bohlin

e99b893

2020-07-13 16:01:51 +0200

[diff] [blame]

821

822

Jonas Ohlsson

2022-03-01 12:39:55 +0100

[diff] [blame]

823

def print_feature_map(fm: Optional[NpuFeatureMap], name: str):
    """
    Prints shape, region, layout, data type, size, quantization, strides, tiles and name
    of the given feature map. Nothing is printed if fm is None.

    :param fm: the feature map to print, or None
    :param name: label printed in front of the feature map information
    """
    if fm is None:
        return

    quant = fm.quantization
    if quant is None:
        q = "no quantization"
    else:
        q = f"scale: {fm.quantization.scale_f32}, zero: {fm.quantization.zero_point}"

    h, w, c = fm.shape
    sz = h * w * c * fm.data_type.size_in_bytes()
    print(f" {name}: h={h},w={w},c={c}, region={fm.region}, {fm.layout}, {fm.data_type}, size={sz}, {q}")

    strides = get_strides(fm)
    stride_str = f"Stride y/x/c: {strides.height}/{strides.width}/{strides.depth}"
    t = fm.tiles
    addresses = list(map(hex, t.addresses))
    print(f" {stride_str}, tiles: w0={t.width_0}, h0={t.height_0}, h1={t.height_1}, base={addresses}")
    print(f" name={fm.name}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

839

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

840

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

841

def print_operation(npu_op: NpuOperation, index: int = 0, cmd=None):
    """
    Prints a human readable description of a single NPU operation.

    Operations that are neither DMA nor block operations, and DMA operations, are printed
    as a single header line. Block operations additionally get their feature maps, kernel,
    padding, weights, biases, activation and block config printed.

    :param npu_op: the operation to print
    :param index: index of the operation, printed first on the header line
    :param cmd: optional extra information that is appended to the header line when truthy
    """
    pass_info = f" {cmd}" if cmd else ""
    if isinstance(npu_op, NpuOperation) and not isinstance(npu_op, (NpuDmaOperation, NpuBlockOperation)):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
        return
    if isinstance(npu_op, NpuDmaOperation):
        print(f"{index} {npu_op.op_type.name} name={npu_op.name}, src={npu_op.src}, dest={npu_op.dest}:{pass_info}")
        return
    k = None if npu_op.kernel is None else to_kernel(npu_op.kernel)
    if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)):
        print(f"{index} {npu_op.sub_op_type.name} {npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    else:
        # A Conv2D with a 1x1 kernel, unit stride and unit dilation is reported as FullyConnected.
        # k can be None (see above), so it must be checked before it is dereferenced.
        if (
            isinstance(npu_op, NpuConv2DOperation)
            and k is not None
            and k.elements_wh() * k.stride.x * k.stride.y * k.dilation.x * k.dilation.y == 1
        ):
            fc = "FullyConnected "
        else:
            fc = ""
        print(f"{index} {fc}{npu_op.op_type.name} name={npu_op.name}:{pass_info}")
    print_feature_map(npu_op.ifm, "IFM")
    if npu_op.ifm2_scalar is not None:
        quant_val = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
        print(f" IFM2: Scalar={npu_op.ifm2_scalar} (quantized: {quant_val}), {npu_op.ifm2.quantization}")
    else:
        print_feature_map(npu_op.ifm2, "IFM2")
    print_feature_map(npu_op.ofm, "OFM")
    if k is not None and npu_op.op_type != NpuOperationType.ElementWise:
        print(f" Kernel: {k}")
    if npu_op.padding is not None:
        print(f" {npu_op.padding}")
    for weights in npu_op.weights:
        print(f" Weights: {weights}")
    for bias in npu_op.biases:
        print(f" Scales: {bias}")
    if npu_op.activation is not None:
        act = npu_op.activation
        # A plain NONE_OR_RELU activation without min/max is not worth printing
        if act.op_type != NpuActivationOp.NONE_OR_RELU or act.min is not None or act.max is not None:
            lut = f", lut index={act.lookup_table_index}" if act.op_type == NpuActivationOp.TABLE_LOOKUP else ""
            print(f" Activation: {act.op_type.name}, min={act.min}, max={act.max}{lut}")
    if isinstance(npu_op, NpuConv2DOperation):
        print(f" {npu_op.block_traversal}")
    bh, bw, bc = npu_op.block_config
    # Only pooling and elementwise operations have their rescale attribute printed
    rescale = (
        f", rescale={npu_op.rescale}" if isinstance(npu_op, (NpuPoolingOperation, NpuElementWiseOperation)) else ""
    )
    print(f" Block config: h={bh},w={bw},c={bc}, {npu_op.ifm_upscale}, {npu_op.rounding_mode}{rescale}")

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

888

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

889

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

890

def print_operations(npu_op_list: List[NpuOperation], npu_op_to_cmd=None):
    """
    Prints every operation of the list together with its index.

    :param npu_op_list: the operations to print
    :param npu_op_to_cmd: optional mapping from operation to extra information that is printed
                          on the operation's header line; operations without an entry get None
    """
    if npu_op_to_cmd is None:
        npu_op_to_cmd = {}
    for position, operation in enumerate(npu_op_list):
        extra_info = npu_op_to_cmd.get(operation)
        print_operation(operation, position, extra_info)

Tim Hall

2020-04-27 18:20:16 +0100

[diff] [blame]

894

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

895

896

# -------------------------------------------------------------------

897

# OPERATIONS

898

# -------------------------------------------------------------------

899

900

901

def generate_operation_code(emit: CommandStreamEmitter, npu_op: NpuOperation):
    """
    Emits the NPU_OP_* command that matches the type of the given operation.

    DMA operations pass channel * 16 + mode as parameter; pooling and elementwise
    operations pass their mapped sub operation type.
    """
    if isinstance(npu_op, NpuDmaOperation):
        dma_param = npu_op.channel * 16 + npu_op.mode
        emit.cmd_do_operation(cmd0.NPU_OP_DMA_START, dma_param)
        return
    if isinstance(npu_op, NpuConv2DOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_CONV)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_DEPTHWISE)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_POOL, param=pooling_op_map[npu_op.sub_op_type])
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        emit.cmd_do_operation(cmd0.NPU_OP_ELEMENTWISE, param=elementwise_op_map[npu_op.sub_op_type])
        return
    assert 0, "Unsupported operation"

915

916

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

917

def generate_conv2d_op(emit: CommandStreamEmitter, npu_op: NpuConv2DOperation, arch: ArchitectureFeatures):
    """Emits the registers of a Conv2D operation, using the block traversal stored in the operation"""
    traversal = npu_op.block_traversal
    generate_common(emit, npu_op, block_traversal=traversal, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

920

921

Dwight Lidman

2020-12-08 17:56:44 +0100

[diff] [blame]

922

def generate_conv_depthwise_op(
    emit: CommandStreamEmitter, npu_op: NpuConvDepthWiseOperation, arch: ArchitectureFeatures
):
    """Emits the registers of a depthwise convolution, which always uses depth first block traversal"""
    traversal = NpuBlockTraversal.DEPTH_FIRST
    generate_common(emit, npu_op, block_traversal=traversal, arch=arch)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

927

928

929

def generate_pooling_op(emit: CommandStreamEmitter, npu_op: NpuPoolingOperation, arch: ArchitectureFeatures):
    """
    Generates register commands for pooling operations.

    Global OFM scaling is used for AVERAGE/REDUCE_SUM operations whose padding sums to zero.
    If the operation carries an ExplicitScaling in its rescale attribute, that decides instead:
    global scaling is used exactly when the explicit scaling is not per channel.

    :param emit: command stream emitter that receives the register commands
    :param npu_op: the pooling operation
    :param arch: architecture features of the target accelerator
    """
    use_global_scale = (
        npu_op.sub_op_type in (NpuPoolingOp.AVERAGE, NpuPoolingOp.REDUCE_SUM) and sum(npu_op.padding) == 0
    )
    # Note: reuse of rescale for explicit scaling to not expose this in the external API
    # (isinstance is False for None, so no separate None check is needed)
    if isinstance(npu_op.rescale, ExplicitScaling):
        use_global_scale = not npu_op.rescale.per_channel
    generate_common(emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale)
    # Pooling op specific
    if use_global_scale:
        generate_ofm_scaling_for_pooling(emit, npu_op)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

941

942

943

def generate_elementwise_op(emit: CommandStreamEmitter, npu_op: NpuElementWiseOperation, arch: ArchitectureFeatures):
    """
    Emits the registers of an elementwise operation.

    The scaling registers are generated first, followed by the common registers.
    Binary operations additionally get the IFM2 registers, including the quantized scalar if one is used.
    """
    globally_scaled_ops = (
        NpuElementWiseOp.ADD,
        NpuElementWiseOp.SUB,
        NpuElementWiseOp.MUL,
        NpuElementWiseOp.LRELU,
        NpuElementWiseOp.ABS,
    )
    use_global_scale = npu_op.sub_op_type in globally_scaled_ops
    op_to_scale = generate_scaling_for_elementwise(emit, npu_op)
    generate_common(
        emit, npu_op, NpuBlockTraversal.DEPTH_FIRST, arch, use_global_scale=use_global_scale, op_to_scale=op_to_scale
    )

    # Elementwise op specific
    if npu_op.sub_op_type in UNARY_ELEMWISE_OPS:
        # Unary operation; there is no IFM2 to describe
        return

    # Binary operation; generate IFM2 registers
    assert npu_op.ifm2 is not None
    has_scalar = npu_op.ifm2_scalar is not None
    generate_ifm2(emit, npu_op.ifm2, has_scalar)
    generate_ifm_precision(emit, npu_op.ifm2, 0, cmd0.NPU_SET_IFM2_PRECISION)
    generate_ifm2_broadcast(emit, npu_op)
    if not has_scalar:
        return

    quantized_scalar = quantise(npu_op.ifm2_scalar, npu_op.ifm2.quantization)
    # The quantized scalar must be representable in the data type of IFM2
    assert npu_op.ifm2.data_type.min_value() <= quantized_scalar <= npu_op.ifm2.data_type.max_value()
    emit.cmd0_with_param(cmd0.NPU_SET_IFM2_SCALAR, quantized_scalar)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

968

969

970

def generate_dma_op(emit: CommandStreamEmitter, dma_op: NpuDmaOperation):
    """
    Emits the NPU_SET_DMA0_* registers of a DMA operation: source region and address,
    destination region and address, and the transfer length (taken from the source)
    """
    source = dma_op.src
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_SRC_REGION, source.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_SRC, source.address)

    destination = dma_op.dest
    emit.cmd0_with_param(cmd0.NPU_SET_DMA0_DST_REGION, destination.region)
    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_DST, destination.address)

    emit.cmd1_with_address(cmd1.NPU_SET_DMA0_LEN, dma_op.src.length)

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

978

979

Louis Verhaard

2020-11-25 14:10:30 +0100

[diff] [blame]

980

def generate_registers_for_op(emit: CommandStreamEmitter, npu_op: NpuOperation, arch: ArchitectureFeatures):
    """
    Emits the register commands of the given operation by dispatching on its type.
    The final NPU_OP_... command is not generated here.
    """
    if isinstance(npu_op, NpuConv2DOperation):
        generate_conv2d_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuConvDepthWiseOperation):
        generate_conv_depthwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuPoolingOperation):
        generate_pooling_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuElementWiseOperation):
        generate_elementwise_op(emit, npu_op, arch)
        return
    if isinstance(npu_op, NpuDmaOperation):
        # DMA operations do not need the architecture features
        generate_dma_op(emit, npu_op)
        return
    assert 0, "Unsupported operation"

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

997

998

999

def generate_command_stream(
    npu_op_list: List[NpuOperation],
    arch: ArchitectureFeatures,
    verbose: bool,
    mem_limits: Dict[int, int],
    add_to_debug_db=None,
    npu_op_to_cmd=None,
) -> List[int]:
    """
    Builds the register command stream for a list of NPU operations.

    :param npu_op_list: the operations to generate commands for, in execution order
    :param arch: architecture features of the target accelerator
    :param verbose: if True, the operations and the generated commands are printed
    :param mem_limits: limits that every operation's memory accesses are checked against
    :param add_to_debug_db: optional callable, invoked with (operation, emitter offset) after each operation
    :param npu_op_to_cmd: optional mapping from operation to extra information, only used for printing
    :return: Ethos-U instructions, as a list of 32-bit integers
    :raises VelaError: if an operation fails its checks (the message is extended with the operation's
                       index and type) or if the command stream reaches the 16 MiB limit
    """
    emit = CommandStreamEmitter()
    if verbose:
        print_operations(npu_op_list, npu_op_to_cmd)

    # Calculate memory accesses for every operation
    memory_accesses: Dict[NpuOperation, MemoryAccessSet] = {}
    for op in npu_op_list:
        if isinstance(op, NpuDmaOperation):
            memory_accesses[op] = get_dma_memory_accesses(op)
        elif isinstance(op, NpuBlockOperation):
            memory_accesses[op] = get_op_memory_accesses(op, arch)
        else:
            assert 0, "Invalid operation type"

    if arch.is_ethos_u65_system:
        emit.cmd0_with_param(cmd0.NPU_SET_PARALLEL_MODE, arch.ncores - 1)

    dep_watermark = Watermark(0, 0)
    prev_op = None
    # Generate register commands for all operations
    for op_index, npu_op in enumerate(npu_op_list):
        try:
            check_mem_limits(memory_accesses[npu_op], mem_limits)
            dep_watermark, cmd_waits = get_wait_dependency(arch, npu_op_list, memory_accesses, op_index, dep_watermark)
            generate_registers_for_op(emit, npu_op, arch)
        except VelaError as e:
            # Add operation info and rethrow
            raise VelaError(f"{e.error_msg}, in operation {op_index}:{npu_op.op_type.name}") from None

        if isinstance(npu_op, NpuBlockOperation) and not isinstance(npu_op, NpuDmaOperation):
            # Generate BLOCKDEP, limited to what the architecture supports;
            # only block operations are remembered as previous operation
            blockdep = min(calc_blockdep(arch, prev_op, npu_op), arch.max_blockdep)
            emit.cmd0_with_param(cmd0.NPU_SET_BLOCKDEP, blockdep)
            prev_op = npu_op

        generate_cmd_waits(emit, cmd_waits)
        # Generate the actual NPU_OP command
        generate_operation_code(emit, npu_op)
        if add_to_debug_db is not None:
            add_to_debug_db(npu_op, emit.offset)

    # Fill in final part of command stream:
    emit.cmd_do_operation(cmd0.NPU_OP_STOP, param=0xFFFF)
    res = emit.to_list()

    max_stream_bytes = 1 << 24
    if emit.size_in_bytes() >= max_stream_bytes:
        size_mib = emit.size_in_bytes() / 2**20
        raise VelaError(
            "The command stream size exceeds the hardware limit of 16 MiB. "
            f"The current stream size is {size_mib:.2F} MiB."
        )

    if verbose:
        emit.print_cmds()
        print(f"Number of commands = {len(emit.cmd_stream)}")
        print(f"Command stream length = {emit.size_in_bytes()} bytes")
    return res

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1066

def generate_register_command_stream(npu_op_list: List[NpuOperation], npu_accelerator: NpuAccelerator) -> List[int]:

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1067

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1068

Internal implementation of the public facing API for generating an Ethos-U register command stream.

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1069

Calculates dependencies between commands and inserts wait operations if needed.

1070

1071

:param npu_op_list: List[NpuOperation] list of high level NPU operations

Tim Hall

2020-10-27 12:43:14 +0000

[diff] [blame]

1072

:param accelerator: architecture_features.Accelerator enum to pick the correct Ethos-U accelerator

1073

:return Ethos-U instructions, as a list of 32-bit integers

Louis Verhaard

2020-11-02 18:04:27 +0100

[diff] [blame]

1074

"""

Louis Verhaard

aeae567

2020-11-02 18:04:27 +0100

[diff] [blame]

1075

accelerator = Accelerator.from_npu_accelerator(npu_accelerator)

Louis Verhaard

5207830

2020-11-18 13:35:06 +0100

[diff] [blame]

1076

arch = create_default_arch(accelerator)

Louis Verhaard